def main():
    model, encoder_model, decoder_model = create_models(300, 512, 300)
    model.load_weights(sys.argv[1])

    ft_en = FastText('embeddings/wiki.en.bin')
    ft_tl = FastText('embeddings/wiki.tl.bin')

    start_seq = ft_en.get_numpy_vector(SOS, normalized=True).reshape(1, 1, -1)
    chars = '.,?!()'

    while True:
        input_sentence = input('Input Tagalog: ').lower()  #'kamusta ka ?'
        for c in chars:
            input_sentence = input_sentence.replace(c, ' ' + c + ' ')

        print('Embedding...')
        input_seq = input_sentence.lower().split()
        aaa = np.zeros((1, 15, 300), dtype='float32')
        for i, w in enumerate(input_seq):
            aaa[0, i] = ft_tl.get_numpy_vector(w, normalized=True)
        #input_seq = [ft_tl.get_numpy_vector(i, normalized=True) for i in input_seq]
        #input_seq = np.stack(input_seq).reshape(1, -1, 300)
        input_seq = aaa
        print(input_seq)

        print('Translating...')
        decoded_sentence = decode_sequence(input_seq, encoder_model,
                                           decoder_model, ft_en, start_seq)
        print('-')
        print('Input sentence:', input_sentence)
        print('Decoded sentence:', decoded_sentence)
def main():
    model = FastText('model_text8.bin')

    target_words = [
        'granada', 'python', 'harmony', 'mafia', 'yoga', 'goth', 'cyberpunk',
        'nasa', 'japan', 'boolean', 'foodball', 'algorithm', 'china', 'usa',
        'internet', 'harvard', 'earth', 'horse', 'angel', 'rock'
    ]

    for t_word in target_words:
        # get embedding
        target_word_embedding = model.get_numpy_vector(t_word)
        print('Target word:', t_word)
        #print('Embedding shape:', target_word_embedding.shape)
        #print('Embedding:', target_word_embedding[0:10], '...')

        # find closest words
        closest_words = model.nearest_neighbors(t_word, k=15)

        # init array
        nn_word_embedding = np.zeros(shape=(15, 128))
        i = 0
        for word, similarity in closest_words:
            # get each word embedding
            nn_word_embedding[i] = model.get_numpy_vector(word)
            #print('Word:', word, 'Vec:', nn_word_embedding[i])
            i = i + 1

        # kmeans
        #print(nn_word_embedding.shape)
        #print(closest_words)
        cluster_model = KMeans(n_clusters=3, init='k-means++')
        prediction = cluster_model.fit_predict(nn_word_embedding)
        print(prediction)

        j = 0
        for word in closest_words:
            print('Word:', word[0], '- Cluster #%d' % (prediction[j] + 1))
            j = j + 1
class FeatureGenerator:
    def __init__(self, fastext_path):
        self.fasttext = FastText(fastext_path)

    def generate_record(self, tuple):
        tr = self.fasttext.get_numpy_vector(tuple[0])
        si = self.fasttext.get_numpy_vector(tuple[1])
        lm = self.fasttext.get_numpy_vector(tuple[2])
        #return numpy.concatenate((tr, lm))
        #return numpy.concatenate((tr, si, lm))
        return numpy.concatenate((tr, si, lm, lm - tr))
        #return numpy.concatenate((si, lm - tr, tr - lm))

    def generate(self, values):
        return numpy.array([self.generate_record(value) for value in values])
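# Usage sketch (not from the original source): the model path and the example
# triples below are hypothetical; each record concatenates four vectors of the
# model's dimensionality, so features.shape[1] == 4 * dim.
if __name__ == '__main__':
    generator = FeatureGenerator('embeddings/wiki.en.bin')
    features = generator.generate([('walk', 'walks', 'walking'),
                                   ('run', 'runs', 'running')])
    print(features.shape)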
def use_pyfasttext_model():  # OK
    # The model can be trained with the fastText command-line tool (../doc/fastText_train.png)
    # or with the pyfasttext package used in this file.
    """
    # OK: 1. Loading a model trained with the pyfasttext package
    model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))  # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])  # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])  # OOV
    print(model.get_numpy_vector("陈贺"))

    model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(model["先生"])
    print(model.get_numpy_vector("先生"))  # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])  # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])  # OOV
    print(model.get_numpy_vector("陈贺"))

    # NOTE: A quick test shows that the two different models produce the same vector
    # for the same OOV word (the same behaviour as the fasttext package; see
    # NO_2_use_fasttext_model for details), while vectors for in-vocabulary words differ.
    """
    # OK: 2. Loading a model trained with the fastText command-line tool
    model = FastText("../data/880w_fasttext_skip_gram.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))
    # print(model["刘晓伟"])  # OK. OOV
    # print(model["陈贺"])  # OK. OOV

    # Sentence and text vectors.
    sentence_vec = model.get_numpy_sentence_vector("刘晓伟 是 个 好人")
    print(sentence_vec)
def make_embedding_matrix(word_index, fname):
    model = FastText(os.path.join('embeddings', fname))
    embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM), dtype='float32')
    for word, i in word_index.items():
        embedding_matrix[i] = model.get_numpy_vector(word, normalized=True)
    return embedding_matrix
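# Usage sketch (not from the original source): assumes EMBEDDING_DIM above matches
# the fastText model's dimensionality and that word_index is a {word: index} mapping
# such as a keras Tokenizer's word_index. Row 0 stays all-zero for padding.
#
#   word_index = {'hello': 1, 'world': 2}                       # hypothetical vocabulary
#   embedding_matrix = make_embedding_matrix(word_index, 'wiki.en.bin')
#   # shape: (len(word_index) + 1, EMBEDDING_DIM); usable as initial weights of an Embedding layer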
class FastTextEmbedding(Embedding):
    def __init__(self, binfile, normalize=False):
        self.file = binfile
        self.vdim = -1
        self.normalize = normalize

    def load(self):
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def search(self, q, topk=4):
        raise NotImplementedError()

    def wordForVec(self, v):
        word, sim = self.ftmodel.words_for_vector(v)[0]
        return word, sim

    def containsWord(self, word):
        return True

    def vocabulary(self):
        return self.ftmodel.words

    def dim(self):
        return self.vdim
class FastTextEmbeddings:
    def __init__(self, path):
        self.fasttext = FastText(path)

    def generate(self, sentence):
        return [self.fasttext.get_numpy_vector(word) for word in sentence]

    def size(self):
        return 300
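# Usage sketch (not from the original source; the path is hypothetical):
# `generate` expects an already tokenised sentence, i.e. a list of words, and
# returns one vector per word (300-dimensional for a wiki .bin model).
#
#   embeddings = FastTextEmbeddings('embeddings/wiki.en.bin')
#   vectors = embeddings.generate(['good', 'morning'])
#   # len(vectors) == 2, each entry a numpy array of length embeddings.size()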
def build_w2v(relevant_tokens, model_file='wiki.cy.bin'):
    # using this library because it's more memory friendly for python :)
    from pyfasttext import FastText

    model = FastText(model_file)
    w2v = {}
    for token in relevant_tokens:
        vec = model.get_numpy_vector(token)
        w2v[token] = vec
    return w2v
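# Usage sketch (not from the original source): the tokens are illustrative only.
# Restricting the lookup to `relevant_tokens` keeps just the vectors the task
# needs in memory once the full .bin model goes out of scope.
if __name__ == '__main__':
    w2v = build_w2v(['bore', 'da', 'croeso'], model_file='wiki.cy.bin')
    print(len(w2v), w2v['croeso'].shape)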
def main():
    model = FastText('model_text8.bin')
    target_word = 'dog'

    # get embedding
    target_word_embedding = model.get_numpy_vector(target_word)
    print('Target word:', target_word)
    print('Embedding shape:', target_word_embedding.shape)
    print('Embedding:', target_word_embedding[0:10], '...')

    # find closest words
    closest_words = model.nearest_neighbors(target_word, k=15)
    for word, similarity in closest_words:
        print('Word:', word, 'similarity:', similarity)
def __init__(self, doc_catgy, n_vocab, emb_dim, out_channels, filter_size,
             word2index, pre_trained_embedding, multi_label):
    self.in_channels = 1
    self.out_channels = out_channels
    self.row_dim = emb_dim
    self.hidden_dim = 512  ## fixed
    self.doc_catgy = doc_catgy
    self.n_classes = len(doc_catgy)
    self.n_vocab = n_vocab
    self.filter_size = filter_size
    self.word2index = word2index
    self.mutli_label = multi_label
    self.le = None
    if self.mutli_label == 1:
        self.le = MultiLabelBinarizer(
            classes=[i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])],
            sparse_output=False)
    elif self.mutli_label == 0:
        self.le = LabelEncoder()
        self.le.fit([i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])])
    self.look_up_table = None
    self.pre_trained_embedding = pre_trained_embedding
    super(XMLCnn, self).__init__()
    self.to_gpu()

    if self.pre_trained_embedding is not None:
        model = FastText(self.pre_trained_embedding)
        dim = len(model['a'])
        n_vocab = len(self.word2index.keys())
        self.look_up_table = self.xp.zeros((n_vocab, dim), dtype=np.float32)
        for word, index in tqdm(self.word2index.items()):
            try:
                self.look_up_table[index] = chainer.cuda.to_gpu(model.get_numpy_vector(word))
            except:
                self.xp.random.seed(index)
                self.look_up_table[index][:] = self.xp.random.uniform(-0.25, 0.25, dim)

    self.set_seed_random(123)
    with self.init_scope():
        if self.look_up_table is None:
            self.embedding = L.EmbedID(n_vocab, self.row_dim, ignore_label=-1,
                                       initialW=linear_init)
        else:
            self.embedding = L.EmbedID(n_vocab, self.row_dim, ignore_label=-1,
                                       initialW=self.look_up_table)
        self.conv1 = L.Convolution2D(self.in_channels, self.out_channels,
                                     (filter_size[0], self.row_dim), stride=2,
                                     initialW=linear_init)
        self.conv2 = L.Convolution2D(self.in_channels, self.out_channels,
                                     (filter_size[1], self.row_dim), stride=2,
                                     initialW=linear_init)
        self.conv3 = L.Convolution2D(self.in_channels, self.out_channels,
                                     (filter_size[2], self.row_dim), stride=2,
                                     initialW=linear_init)
        self.l1 = L.Linear(in_size=None, out_size=self.hidden_dim, initialW=linear_init)
        self.l2 = L.Linear(in_size=self.hidden_dim, out_size=self.n_classes,
                           initialW=linear_init)
    self.to_gpu()
class SVM:
    def __init__(self, TFIDF):
        # TODO: plz give me the model or using global singleton
        self.model = FastText('/home/yee0/Atos/wiki.en.bin')
        self.TFIDF = TFIDF
        self.tfidf_dict, self.words = TFIDF.get_tfidf()

    def set_tfidf(self, TFIDF):
        self.TFIDF = TFIDF
        self.tfidf_dict, self.words = TFIDF.get_tfidf()

    def to_feature(self, X):
        return [self.vec(self.TFIDF.predict_field(post), post) for post in X]

    def vec(self, field_index, post):
        v = np.zeros(300)
        post = set(post)  # make unique
        for pl in post:
            if pl != '' and pl in self.words:
                v += self.model.get_numpy_vector(pl) * self.tfidf_dict[field_index][pl]
        return v

    def train(self, X, y):
        # Build the SVC model
        self.svc = svm.SVC()
        svc_fit = self.svc.fit(self.to_feature(X), y)

    def predict(self, post):
        return self.svc.predict(self.to_feature([post]))

    def save_model(self):
        with open('steevebot/save/svm.pickle', 'wb') as f:
            pickle.dump(self.svc, f)

    def restore_model(self):
        with open('steevebot/save/svm.pickle', 'rb') as f:
            self.svc = pickle.load(f)
class FastTextWrapper(EmbeddingWrapper):
    """
    Contains the FastText object, the name of the file from which the embeddings
    were loaded as well as its md5.
    """

    def __init__(self, fasttext_path):
        super(FastTextWrapper, self).__init__(fasttext_path)
        self._fasttext = FastText(fasttext_path)
        self._word_output_len = self.get_numpy_vector("checklen").shape[0]

    def __len__(self):
        return self._word_output_len

    def get_numpy_vector(self, word):
        return self._fasttext.get_numpy_vector(word)

    def emb_obj(self):
        """ Returns the FastText object """
        return self._fasttext
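# Usage sketch (not from the original source; the path is hypothetical):
#
#   wrapper = FastTextWrapper('embeddings/wiki.en.bin')
#   print(len(wrapper))                       # embedding dimensionality, probed via "checklen"
#   vector = wrapper.get_numpy_vector('hello')
#   model = wrapper.emb_obj()                 # underlying pyfasttext FastText instance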
class FastTextEmbedding(Embedding):
    def __init__(self, binfile, normalize=False):
        self.file = binfile
        self.vdim = -1
        self.normalize = normalize

    def load(self):
        print('Loading fasttext model.')
        self.ftmodel = FastText()
        self.ftmodel.load_model(self.file)
        self.vdim = len(self.ftmodel['is'])
        print('Finished loading fasttext model.')
        return self

    def getVector(self, word):
        return self.ftmodel.get_numpy_vector(word, normalized=self.normalize)

    def wordForVec(self, v):
        word, sim = self.ftmodel.words_for_vector(v)[0]
        return word, sim

    def nearest_neighbors(self, word, n=200):
        tuples = self.ftmodel.nearest_neighbors(word, n)
        return tuples

    def nearest_neighbors_by_vector(self, v, n=200):
        tuples = self.ftmodel.words_for_vector(v, n)
        return tuples

    def containsWord(self, word, explicit=False):
        if explicit:
            return word in self.vocabulary()
        return True

    def vocabulary(self):
        return self.ftmodel.words

    def dim(self):
        return self.vdim
def main():
    texts_tl, texts_en = data.parse_corpora('corpus')
    word_index_tl, word_index_en, encoder_input_data, decoder_input_data, decoder_target_data = data.preprocess(
        texts_en, texts_tl)

    embedding_dim = 300
    latent_dim = 512

    model, encoder_model, decoder_model = create_models(embedding_dim, latent_dim, embedding_dim)
    model.load_weights(sys.argv[1])

    indexes = np.random.randint(0, len(texts_tl), 100)

    ft_model = FastText(os.path.join('embeddings', 'wiki.en.bin'))
    start_seq = ft_model.get_numpy_vector(data.SOS, normalized=True).reshape(1, 1, -1)

    embedding_weights = np.load('embedding-weights.npz')
    e_tl = embedding_weights['tl'].astype('float32')

    for seq_index in indexes:
        # Take one sequence (part of the training set)
        # for trying out decoding.
        sentence = texts_tl[seq_index]
        input_seq = encoder_input_data[seq_index]
        input_seq = np.take(e_tl, input_seq, axis=0).reshape(1, -1, 300)
        print(input_seq)
        #input_seq = sentence.split()[1:-1]
        #print(input_seq)
        #input_seq = np.stack(list(ft_model.get_numpy_vector(i, normalized=True) for i in input_seq)).reshape(1, -1, 300)
        #input_seq = np.stack(list(map(ft_model.get_numpy_vector, input_seq))).reshape(1, -1, 300)
        #print(input_seq)
        #print(input_seq.shape)

        decoded_sentence = decode_sequence(input_seq, encoder_model, decoder_model,
                                           ft_model, start_seq)
        print('-')
        print('Input sentence:', texts_tl[seq_index])
        print('Decoded sentence:', decoded_sentence)
from pyfasttext import FastText

ft = FastText('model.bin')
print(ft.get_numpy_vector(u'you'))
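# A few more pyfasttext calls that pair naturally with the snippet above
# (a sketch, assuming 'model.bin' is a fastText binary model on disk):
#
#   print(ft.get_numpy_vector(u'you', normalized=True))   # unit-length vector
#   print(ft.nearest_neighbors(u'you', k=5))              # [(word, similarity), ...]
#   print(ft.get_numpy_sentence_vector(u'how are you'))   # sentence-level vector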
def main():
    model = FastText('model_text8.bin')

    target_words = [
        'deep', 'president', 'self', 'insult', 'general', 'inclined',
        'property', 'international', 'many', 'imprisoned', 'branches',
        'communist', 'france', 'strict', 'earthly', 'zero', 'feminism',
        'ideas', 'theory', 'writings'
    ]

    for target_word in target_words:
        # get embedding
        target_word_embedding = model.get_numpy_vector(target_word)
        print('Target word:', target_word)
        #print('Embedding shape:', target_word_embedding.shape)
        #print('Embedding:', target_word_embedding[0:10], '...')

        # find closest words
        closest_words = model.nearest_neighbors(target_word, k=15)
        closest_word_embeddings = []
        for word, similarity in closest_words:
            print('Word:', word, 'similarity:', similarity)
            closest_word_embeddings.append(model.get_numpy_vector(word))

        # cluster the neighbours into three groups
        kmeans = cluster.KMeans(n_clusters=3)
        kmeans.fit(closest_word_embeddings)
        labels = kmeans.labels_
        print('Cluster id labels for inputted data')
        print(labels)

        cluster1 = []
        cluster2 = []
        cluster3 = []
        for i in range(0, 15):
            if labels[i] == 0:
                cluster1.append(closest_words[i][0])
            if labels[i] == 1:
                cluster2.append(closest_words[i][0])
            if labels[i] == 2:
                cluster3.append(closest_words[i][0])
        print("cluster #1 : ", cluster1)
        print("cluster #2 : ", cluster2)
        print("cluster #3 : ", cluster3)
class NB_Implement():
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText("../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        print(f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}")
        # self.vector_size:200, self.vocab_size: 925242

        # Sentence representation: {"avg": average of the word vectors,
        # "fasttext": get_numpy_sentence_vector, "matrix": matrix}
        self.sentence_vec_type = "avg"

    def set_sent_vec_type(self, sentence_vec_type):
        assert sentence_vec_type in ["avg", "matrix", "fasttext"], \
            "sentence_vec_type must be in ['avg', 'fasttext', 'matrix']"
        self.sentence_vec_type = sentence_vec_type

    def gen_sentence_vec(self, sentence):
        """
        :param sentence:
        :return:
        """
        sentence = sentence.strip()
        if self.sentence_vec_type == "fasttext":
            return self.model.get_numpy_sentence_vector(sentence)

        word_list = [word for word in sentence.split(" ")]
        word_len = len(word_list)
        if self.sentence_vec_type == "matrix":
            sentence_matrix = np.empty(word_len, dtype=list)
            for idx, word in enumerate(word_list):
                sentence_matrix[idx] = self.model.get_numpy_vector(word)
            return sentence_matrix
        else:  # self.sentence_vec_type == "avg":
            sentence_vector = np.zeros(self.vector_size)  # <ndarray>
            # print(f"type(sentence_vector): {type(sentence_vector)}")
            for idx, word in enumerate(word_list):
                # print(f"type(self.model.get_numpy_vector(word)): {type(self.model.get_numpy_vector(word))}")  # <ndarray>
                sentence_vector += self.model.get_numpy_vector(word)
            return sentence_vector / len(word_list)

    def gen_train_val_data(self):
        """ Build the training and validation data """
        X_train = list()
        y_train = list()
        for line in open("../data/input/training_set.txt"):
            line = line.strip().split("\t")
            sent_vector = self.gen_sentence_vec(line[-1])
            X_train.append(sent_vector)
            y_train.append(int(line[0]))

        X_val = list()
        y_val = list()
        for line in open("../data/input/validation_set.txt"):
            line = line.strip().split("\t")
            sent_vector = self.gen_sentence_vec(line[-1])
            X_val.append(sent_vector)
            y_val.append(int(line[0]))

        return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val),

    def train_bayes(self, X_train, y_train):
        """ Classification based on Naive Bayes """
        from sklearn.naive_bayes import GaussianNB
        model = GaussianNB()
        model.fit(X_train, y_train)
        joblib.dump(model, "../data/output/models/bayes_model")

    def evaluate_bayes(self, model_path, X_val, y_val):
        """ Prediction with the Naive Bayes classifier """
        model = joblib.load(model_path)
        y_val = list(y_val)
        correct = 0
        """
        y_predict = list()
        for sent_vec in X_val:  # sent_vec.shape: (self.vector_size,)
            predicted = model.predict(sent_vec.reshape(1, -1))  # sent_vec.reshape(1, -1).shape: (1, self.vector_size)
            y_predict.append(predicted[0])
        """
        y_predict = model.predict(X_val)
        print(f"len(y_predict): {len(y_predict)}, len(y_val): {len(y_val)}")
        assert len(y_predict) == len(y_val), \
            "Unexpected Error: len(y_predict) != len(y_val), but it should be"
        for idx in range(len(y_predict)):
            if int(y_predict[idx]) == int(y_val[idx]):
                correct += 1
        score = correct / len(y_predict)
        print(f"Bayes Classification Accuracy:{score}")
        return score

    def predict_bayes(self, model_path):
        """ Test on real-world examples """
        model = joblib.load(model_path)
        sentence = "这件 衣服 真的 太 好看 了 ! 好想 买 啊 "
        sent_vec = np.array(self.gen_sentence_vec(sentence)).reshape(1, -1)
        print(f"'{sentence}': {model.predict(sent_vec)}")  # 1: negative
        sentence = "这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了"
        sent_vec = np.array(self.gen_sentence_vec(sentence)).reshape(1, -1)
        print(f"'{sentence}': {model.predict(sent_vec)}")  # 1: negative
        # Update states
        states_value = [h, c]

    return decoded_sentence


#indexes = np.random.randint(0, len(input_texts), 100)
indexes = np.random.randint(0, len(valid_input_texts), 100)
for seq_index in indexes:
    # Take one sequence (part of the training set)
    # for trying out decoding.
    #text = input_texts[seq_index]
    text = valid_input_texts[seq_index]
    words = text.split()
    encoder_input_data = np.zeros((1, max_encoder_seq_length, embedding_dims), dtype='float32')
    for t, word in enumerate(words):
        encoder_input_data[0, t, :] = filModel.get_numpy_vector(word, normalized=True)
        #print("decodeding", word)
    input_seq = encoder_input_data
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, embedding_dims))
        target_seq[0, 0, :] = engModel.get_numpy_vector(sampled_word, normalized=True)

        # Update states
        states_value = [h, c]

    return decoded_sentence


#indexes = np.random.randint(0, len(input_texts), 100)
indexes = np.random.randint(0, len(valid_input_texts), 100)
for seq_index in indexes:
    # Take one sequence (part of the training set)
    # for trying out decoding.
    #text = input_texts[seq_index]
    text = valid_input_texts[seq_index]
    words = text.split()
    encoder_input_data = np.zeros((1, max_encoder_seq_length, embedding_dims), dtype='float32')
    for t, word in enumerate(words):
        encoder_input_data[0, t, :] = filModel.get_numpy_vector(word, normalized=True)
        #print("decodeding", word)
    input_seq = encoder_input_data
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    print('Input sentence:', input_texts[seq_index])
    print('Decoded sentence:', decoded_sentence)
class Preprocessing:
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText("../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        # self.vector_size:200, self.vocab_size: 925242
        print(f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}")

        # Sentence representation:
        # {"avg": average of the word vectors, "fasttext": get_numpy_sentence_vector,
        #  "concatenate": concatenation with padding, "matrix": matrix}
        self.sentence_vec_type = "matrix"

        self.MAX_SENT_LEN = 70  # DEBUG: hyperparameter; see self.get_sent_max_length()
        # For "concatenate": self.MAX_SENT_LEN = 30; results for other values:
        # 100: 50.22%, 80: 50.23%, 70: 50.33%, 60: 55.92%, 50: 69.11%, 40: 68.91%,
        # 36: 69.34%, 30: 69.22%, 20: 69.17%, 10: 67.07%
        # For "matrix": self.MAX_SENT_LEN = 70; results for other values: TODO:

    @classmethod
    def data_analysis(cls):
        train_df = pd.read_csv("../data/input/training_set.txt", sep="\t",
                               header=None, names=["label", "sentence"])
        val_df = pd.read_csv("../data/input/validation_set.txt", sep="\t",
                             header=None, names=["label", "sentence"])
        y_train = train_df["label"]
        y_val = val_df["label"]
        sns.set(style="white", context="notebook", palette="deep")
        # Inspect the sample distribution (whether the labels are balanced)
        sns.countplot(y_train)
        plt.show()
        sns.countplot(y_val)
        plt.show()
        print(y_train.value_counts())
        print(y_val.value_counts())

    def set_sent_vec_type(self, sentence_vec_type):
        assert sentence_vec_type in ["avg", "concatenate", "fasttext", "matrix"], \
            "sentence_vec_type must be in ['avg', 'fasttext', 'concatenate', 'matrix']"
        self.sentence_vec_type = sentence_vec_type

    def get_sent_max_length(self):  # NOT_USED
        sent_len_counter = Counter()
        max_length = 0
        with open("../data/input/training_set.txt") as f:
            for line in f:
                content = line.strip().split("\t")[1]
                content_list = content.split()
                length = len(content_list)
                sent_len_counter[length] += 1
                if max_length <= length:
                    max_length = length
        sent_len_counter = sorted(list(sent_len_counter.items()), key=lambda x: x[0])
        print(sent_len_counter)
        # [(31, 1145), (32, 1105), (33, 1017), (34, 938), (35, 839), (36, 830), (37, 775), (38, 737), (39, 720), (40, 643), (41, 575), (42, 584), (43, 517), (44, 547), (45, 514), (46, 514), (47, 480), (48, 460), (49, 470), (50, 444), (51, 484), (52, 432), (53, 462), (54, 495), (55, 487), (56, 500), (57, 496), (58, 489), (59, 419), (60, 387), (61, 348), (62, 265), (63, 222), (64, 153), (65, 127), (66, 103), (67, 67), (68, 34), (69, 21), (70, 22), (71, 8), (72, 6), (73, 4), (74, 10), (75, 2), (76, 4), (77, 2), (78, 1), (79, 2), (80, 4), (81, 2), (82, 3), (83, 1), (84, 5), (86, 4), (87, 3), (88, 3), (89, 2), (90, 2), (91, 3), (92, 5), (93, 2), (94, 4), (96, 1), (97, 5), (98, 1), (99, 2), (100, 2), (101, 2), (102, 1), (103, 2), (104, 2), (105, 2), (106, 5), (107, 3), (108, 2), (109, 3), (110, 4), (111, 1), (112, 2), (113, 3), (114, 1), (116, 1), (119, 3), (679, 1)]
        return max_length

    def gen_sentence_vec(self, sentence):
        """
        :param sentence:
        :return:
        """
        sentence = sentence.strip()
        if self.sentence_vec_type == "fasttext":
            return self.model.get_numpy_sentence_vector(sentence)

        word_list = sentence.split(" ")
        if self.sentence_vec_type == "concatenate":
            sentence_vector = self.model.get_numpy_vector(word_list[0])
            for word in word_list[1:]:
                sentence_vector = np.hstack((sentence_vector, self.model.get_numpy_vector(word)))
            # NOTE: for "concatenate", sentence_vector has a different length for every sentence
            return sentence_vector
        if self.sentence_vec_type == "matrix":  # for Deep Learning.
            sentence_matrix = []
            # NOTE: keeping the tail of the sentence should work better (see
            # https://github.com/lxw0109/SentimentClassification_UMICH_SI650/blob/master/src/LSTM_wo_pretrained_vector.py#L86)
            for word in word_list[-self.MAX_SENT_LEN:]:
                sentence_matrix.append(self.model.get_numpy_vector(word))
            length = len(sentence_matrix)
            # always holds, because of the slicing above
            assert length <= self.MAX_SENT_LEN, "CRITICAL ERROR: len(sentence_matrix) > self.MAX_SENT_LEN."
            # the input matrix is a list of ndarrays; the returned matrix is an ndarray of ndarrays
            sentence_matrix = np.pad(sentence_matrix,
                                     pad_width=((0, self.MAX_SENT_LEN - length), (0, 0)),
                                     mode="constant", constant_values=-1)
            return sentence_matrix
        else:  # self.sentence_vec_type == "avg":
            sentence_vector = np.zeros(self.vector_size)  # <ndarray>
            # print(f"type(sentence_vector): {type(sentence_vector)}")
            for idx, word in enumerate(word_list):
                # print(f"type(self.model.get_numpy_vector(word)): {type(self.model.get_numpy_vector(word))}")  # <ndarray>
                sentence_vector += self.model.get_numpy_vector(word)
            return sentence_vector / len(word_list)

    def gen_train_val_data(self):
        # Build the training and validation data
        train_df = pd.read_csv("../data/input/training_set.txt", sep="\t",
                               header=None, names=["label", "sentence"])
        val_df = pd.read_csv("../data/input/validation_set.txt", sep="\t",
                             header=None, names=["label", "sentence"])
        # Shuffle the training set. TODO: without shuffling the trained model seems off
        # (the "nice-looking" sentence always gets predicted as 1?)
        train_df = train_df.sample(frac=1, random_state=1)
        # val_df = val_df.sample(frac=1, random_state=1)  # no need to shuffle the validation set

        X_train = train_df["sentence"]
        X_train_vec = list()
        for sentence in X_train:
            sent_vector = self.gen_sentence_vec(sentence)
            X_train_vec.append(sent_vector)
        y_train = train_df["label"]  # <Series>

        X_val = val_df["sentence"]
        X_val_vec = list()
        for sentence in X_val:
            sent_vector = self.gen_sentence_vec(sentence)
            X_val_vec.append(sent_vector)
        y_val = val_df["label"]  # <Series>

        if self.sentence_vec_type == "concatenate":
            # NOTE: dtype is required here; otherwise the default is "int32" and
            # all word vector values would be converted to 0
            X_train_vec = sequence.pad_sequences(X_train_vec,
                                                 maxlen=self.MAX_SENT_LEN * self.vector_size,
                                                 value=0, dtype=np.float)
            X_val_vec = sequence.pad_sequences(X_val_vec,
                                               maxlen=self.MAX_SENT_LEN * self.vector_size,
                                               value=0, dtype=np.float)

        return np.array(X_train_vec), np.array(X_val_vec), np.array(y_train), np.array(y_val)
def main():
    #enc_word2vec = FastText('wiki.tl/wiki.tl.bin')
    #dec_word2vec = FastText('wiki.en/wiki.en.bin')
    dec_word2vec = FastText('wiki.en/wiki.en.bin')
    enc_word2vec = FastText('wiki.tl/wiki.tl.bin')

    #data_path = 'tgl-eng/tgl.txt'
    #test_path = 'valid_split'
    test_path = 'train_split'
    #data_path = 'health_shortened.tsv'

    eos = "eos"
    sos = "sos"
    #savemodel_filename = 's2s_fasttextloader_batch64_twodata.h5'

    # training parameters
    #batch_size = 64  # Batch size for training.
    batch_size = 64
    epochs = 500  # Number of epochs to train for.
    latent_dim = 512  # Latent dimensionality of the encoding space.
    word_vec_size = 300

    #chkpt_path = "checkpoints/weights-improvement-twodata-{epoch:05d}.hdf5"
    #checkpoint = ModelCheckpoint(chkpt_path, verbose=1)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    # so far checkpoints/weights_cosine_proximity_combined2-00063.hdf5 seems to work
    weights_path = 'checkpoints/weights_cosine_proximity_combined2-00063.hdf5'
    model, encoder_model, decoder_model = build_model(word_vec_size, latent_dim, weights_path)
    model.summary()

    # Compile & run training
    #model.compile(optimizer='rmsprop', loss='mean_squared_error')
    #model.compile(optimizer='rmsprop', loss='cosine_proximity')
    #model.compile(optimizer='rmsprop', loss='mean_squared_error')
    # Note that `decoder_target_data` needs to be one-hot encoded,
    # rather than sequences of integers like `decoder_input_data`!
    #num_sentence = 77990
    #steps_per_epoch = int(num_sentence//batch_size)
    decoder_model.summary()

    def decode_sequence(input_seq, sos, eos):
        # Encode the input as state vectors.
        states_value = encoder_model.predict(input_seq)

        # Generate empty target sequence of length 1.
        target_seq = np.zeros((1, 1, word_vec_size))
        # Populate the first character of target sequence with the start character.
        #target_seq[0, 0, target_dict[sos]] = 1.
        # create vector for sos
        target_seq[0, 0, :] = dec_word2vec.get_numpy_vector(sos, normalized=True)

        # Sampling loop for a batch of sequences
        # (to simplify, here we assume a batch of size 1).
        stop_condition = False
        decoded_sentence = ''
        while not stop_condition:
            output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

            # Sample a token
            #sampled_token_index = np.argmax(output_tokens[0, -1, :])
            #sampled_word = target_rev_dict[sampled_token_index]
            sampled_word = dec_word2vec.words_for_vector(output_tokens[0, -1, :])[0][0]
            decoded_sentence += sampled_word + " "

            # Exit condition: either hit max length
            # or find stop character.
            # if sampled_word in [".", "?", "!"] or
            if (sampled_word == eos or len(decoded_sentence) > max_decoder_seq_length):
                stop_condition = True
                if (decoded_sentence.endswith(eos + ' ')):
                    decoded_sentence = decoded_sentence[:-len(eos + ' ')]

            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1, word_vec_size))
            target_seq[0, 0, :] = dec_word2vec.get_numpy_vector(sampled_word, normalized=True)

            # Update states
            states_value = [h, c]

        return decoded_sentence

    input_texts, target_texts = input2target(test_path, sos, eos)
    indexes = np.random.randint(0, len(input_texts), 50)

    #max_encoder_seq_length = max([len(words.split()) for words in input_texts])
    #max_decoder_seq_length = max([len(words.split()) for words in target_texts])
    max_encoder_seq_length = 130
    max_decoder_seq_length = 100

    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, word_vec_size), dtype='float32')
    '''
    for i, text, in enumerate(input_texts):
        words = text.split()
        #words.reverse()
        for t, word in enumerate(words):
            encoder_input_data[i, t, :] = enc_word2vec.get_numpy_vector(word, normalized=True)
    '''

    while True:
        input_sentence = input('Enter Filipino sentence: ')
        print('Input:', input_sentence)
        input_sentence = input_sentence.replace(",", " ,")
        input_sentence = input_sentence.replace(".", " .")
        input_sentence = input_sentence.replace("!", " !")
        input_sentence = input_sentence.replace("?", " ?")
        input_sentence = input_sentence.lower()
        input_words = input_sentence.split()
        for t, word in enumerate(input_words):
            encoder_input_data[0, t, :] = enc_word2vec.get_numpy_vector(word, normalized=True)

        #for seq_index in indexes:
        # Take one sequence (part of the training set)
        # for trying out decoding.
        input_seq = encoder_input_data
        decoded_sentence = decode_sequence(input_seq, sos, eos)
        print('-')
        #print('Input sentence:', input_texts[seq_index])
        print('Decoded sentence:', decoded_sentence)
class Embeddings(object): def __init__(self, name, path='./embedding-registry.json', lang='en', extension='vec', use_ELMo=False): self.name = name self.embed_size = 0 self.static_embed_size = 0 self.vocab_size = 0 self.model = {} self.registry = self._load_embedding_registry(path) self.lang = lang self.extension = extension self.embedding_lmdb_path = None if self.registry is not None: self.embedding_lmdb_path = self.registry["embedding-lmdb-path"] self.env = None self.make_embeddings_simple(name) self.static_embed_size = self.embed_size self.bilm = None # below init for using ELMo embeddings self.use_ELMo = use_ELMo if use_ELMo: self.make_ELMo() self.embed_size = ELMo_embed_size + self.embed_size description = self._get_description('elmo-en') self.env_ELMo = None if description: self.embedding_ELMo_cache = os.path.join( description["path-dump"], "cache") # clean possible remaining cache self.clean_ELMo_cache() # create and load a cache in write mode, it will be used only for training self.env_ELMo = lmdb.open(self.embedding_ELMo_cache, map_size=map_size) def __getattr__(self, name): return getattr(self.model, name) def _load_embedding_registry(self, path='./embedding-registry.json'): """ Load the description of available embeddings. Each description provides a name, a file path (used only if necessary) and a embeddings type (to take into account small variation of format) """ registry_json = open(path).read() return json.loads(registry_json) def make_embeddings_simple_in_memory(self, name="fasttext-crawl", hasHeader=True): nbWords = 0 print('loading embeddings...') begin = True description = self._get_description(name) if description is not None: embeddings_path = description["path"] embeddings_type = description["type"] self.lang = description["lang"] print("path:", embeddings_path) if self.extension == 'bin': self.model = FastText(embeddings_path) nbWords = self.model.nwords self.embed_size = 300 else: if embeddings_type == "glove": hasHeader = False with open(embeddings_path) as f: for line in f: line = line.strip() line = line.split(' ') if begin: if hasHeader: # first line gives the nb of words and the embedding size nbWords = int(line[0]) self.embed_size = int(line[1].replace( "\n", "")) begin = False continue else: begin = False word = line[0] #if embeddings_type == 'glove': vector = np.array( [float(val) for val in line[1:len(line)]], dtype='float32') #else: # vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32') if self.embed_size == 0: self.embed_size = len(vector) self.model[word] = vector if nbWords == 0: nbWords = len(self.model) print('embeddings loaded for', nbWords, "words and", self.embed_size, "dimensions") ''' def make_embeddings_fasttext_bin(self, name="wiki.en.bin"): nbWords = 0 print('loading embeddings...') description = self._get_description(name) if description is not None: embeddings_path = description["path"] print("path:", embeddings_path) self.model = load_fasttext_format(embeddings_path) ''' def make_embeddings_lmdb(self, name="fasttext-crawl", hasHeader=True): nbWords = 0 print( '\nCompiling embeddings... 
(this is done only one time per embeddings at first launch)' ) begin = True description = self._get_description(name) if description is not None: embeddings_path = description["path"] embeddings_type = description["type"] self.lang = description["lang"] print("path:", embeddings_path) if embeddings_type == "glove": hasHeader = False txn = self.env.begin(write=True) batch_size = 1024 i = 0 nb_lines = 0 with open(embeddings_path) as f: for line in f: nb_lines += 1 with open(embeddings_path) as f: #for line in f: for line in tqdm(f, total=nb_lines): line = line.split(' ') if begin: if hasHeader: # first line gives the nb of words and the embedding size nbWords = int(line[0]) self.embed_size = int(line[1].replace("\n", "")) begin = False continue else: begin = False word = line[0] #if embeddings_type == 'glove': vector = np.array( [float(val) for val in line[1:len(line)]], dtype='float32') #else: # vector = np.array([float(val) for val in line[1:len(line)-1]], dtype='float32') if self.embed_size == 0: self.embed_size = len(vector) if len(word.encode( encoding='UTF-8')) < self.env.max_key_size(): txn.put(word.encode(encoding='UTF-8'), _serialize_pickle(vector)) #txn.put(word.encode(encoding='UTF-8'), _serialize_byteio(vector)) i += 1 # commit batch if i % batch_size == 0: txn.commit() txn = self.env.begin(write=True) #if i % batch_size != 0: txn.commit() if nbWords == 0: nbWords = i self.vocab_size = nbWords print('embeddings loaded for', nbWords, "words and", self.embed_size, "dimensions") def make_embeddings_simple(self, name="fasttext-crawl", hasHeader=True): description = self._get_description(name) if description is not None: self.extension = description["format"] if self.embedding_lmdb_path is None or self.embedding_lmdb_path == "None": print( "embedding_lmdb_path is not specified in the embeddings registry, so the embeddings will be loaded in memory..." ) self.make_embeddings_simple_in_memory(name, hasHeader) elif self.extension == "bin": print( "embedding is of format .bin, so it will be loaded in memory..." ) self.make_embeddings_simple_in_memory(name, hasHeader) else: # check if the lmdb database exists envFilePath = os.path.join(self.embedding_lmdb_path, name) if os.path.isdir(envFilePath): description = self._get_description(name) if description is not None: self.lang = description["lang"] # open the database in read mode self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=4) # we need to set self.embed_size and self.vocab_size with self.env.begin() as txn: stats = txn.stat() size = stats['entries'] self.vocab_size = size with self.env.begin() as txn: cursor = txn.cursor() for key, value in cursor: vector = _deserialize_pickle(value) self.embed_size = vector.shape[0] break cursor.close() # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! 
self.env.close() self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=2) else: # create and load the database in write mode self.env = lmdb.open(envFilePath, map_size=map_size) self.make_embeddings_lmdb(name, hasHeader) def make_ELMo(self): # Location of pretrained BiLM for the specified language # TBD check if ELMo language resources are present description = self._get_description('elmo-en') if description is not None: self.lang = description["lang"] vocab_file = description["path-vocab"] options_file = description["path-config"] weight_file = description["path_weights"] print('init ELMo') # Create a Batcher to map text to character ids self.batcher = Batcher(vocab_file, 50) # Build the biLM graph. self.bilm = BidirectionalLanguageModel(options_file, weight_file) # Input placeholders to the biLM. self.character_ids = tf.placeholder('int32', shape=(None, None, 50)) self.embeddings_op = self.bilm(self.character_ids) with tf.variable_scope('', reuse=tf.AUTO_REUSE): # the reuse=True scope reuses weights from the whole context self.elmo_input = weight_layers('input', self.embeddings_op, l2_coef=0.0) def dump_ELMo_token_embeddings(self, x_train): if not self.use_ELMo: print( "Warning: ELMo embeddings dump requested but embeddings object wrongly initialised" ) return description = self._get_description('elmo-en') if description is not None: print("Building ELMo token dump") self.lang = description["lang"] options_file = description["path-config"] weight_file = description["path_weights"] working_path = description["path-dump"] all_tokens = set(['<S>', '</S>']) for i in range(0, len(x_train)): # as it is training, it is already tokenized tokens = x_train[i] for token in tokens: if token not in all_tokens: all_tokens.add(token) vocab_file = os.path.join(working_path, 'vocab_small.txt') with open(vocab_file, 'w') as fout: fout.write('\n'.join(all_tokens)) tf.reset_default_graph() token_embedding_file = os.path.join(working_path, 'elmo_token_embeddings.hdf5') dump_token_embeddings(vocab_file, options_file, weight_file, token_embedding_file) tf.reset_default_graph() self.batcher_token_dump = TokenBatcher(vocab_file) self.bilm_token_dump = BidirectionalLanguageModel( options_file, weight_file, use_character_inputs=False, embedding_weight_file=token_embedding_file) self.token_ids = tf.placeholder('int32', shape=(None, None)) self.embeddings_op_token_dump = self.bilm_token_dump( self.token_ids) """ with tf.variable_scope('', reuse=tf.AUTO_REUSE): # the reuse=True scope reuses weights from the whole context self.elmo_input_token_dump = weight_layers('input', self.embeddings_op_token_dump, l2_coef=0.0) """ print("ELMo token dump completed") def get_sentence_vector_only_ELMo(self, token_list): """ Return the ELMo embeddings only for a full sentence """ if not self.use_ELMo: print( "Warning: ELMo embeddings requested but embeddings object wrongly initialised" ) return # Create batches of data local_token_ids = self.batcher.batch_sentences(token_list) max_size_sentence = local_token_ids[0].shape[0] # check lmdb cache elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence) if elmo_result is not None: return elmo_result with tf.Session() as sess: # weird, for this cpu is faster than gpu (1080Ti !) 
with tf.device("/cpu:0"): # It is necessary to initialize variables once before running inference sess.run(tf.global_variables_initializer()) # Compute ELMo representations (2 times as a heavy warm-up) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) #cache computation self.cache_ELMo_lmdb_vector(token_list, elmo_result) return elmo_result def get_sentence_vector_with_ELMo(self, token_list): """ Return a concatenation of standard embeddings (e.g. Glove) and ELMo embeddings for a full sentence """ if not self.use_ELMo: print( "Warning: ELMo embeddings requested but embeddings object wrongly initialised" ) return """ # trick to extend the context for short sentences token_list_extended = token_list.copy() #print("token_list_extended before: ", token_list_extended) for i in range(0, len(token_list_extended)): local_list = token_list_extended[i] j = i while len(local_list) <= 5: #print(j, local_list) if j < len(token_list_extended)-1: local_list = local_list + token_list_extended[j+1] else: break j = j + 1 token_list_extended[i] = local_list #print("token_list_extended after: ", token_list_extended) max_size_sentence = 0 for i in range(0, len(token_list)): local_length = len(token_list[i]) if local_length > max_size_sentence: max_size_sentence = local_length """ # Create batches of data local_token_ids = self.batcher.batch_sentences(token_list) max_size_sentence = local_token_ids[0].shape[0] # check lmdb cache elmo_result = self.get_ELMo_lmdb_vector(token_list, max_size_sentence) if elmo_result is None: with tf.Session() as sess: # weird, for this cpu is faster than gpu (1080Ti !) with tf.device("/cpu:0"): # It is necessary to initialize variables once before running inference sess.run(tf.global_variables_initializer()) # Compute ELMo representations (2 times as a heavy warm-up) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) elmo_result = sess.run( self.elmo_input['weighted_op'], feed_dict={self.character_ids: local_token_ids}) #cache computation self.cache_ELMo_lmdb_vector(token_list, elmo_result) concatenated_result = np.zeros( (elmo_result.shape[0], max_size_sentence - 2, self.embed_size), dtype=np.float32) for i in range(0, elmo_result.shape[0]): for j in range(0, len(token_list[i])): #if is_int(token_list[i][j]) or is_float(token_list[i][j]): # dummy_result = np.zeros((elmo_result.shape[2]), dtype=np.float32) # concatenated_result[i][j] = np.concatenate((dummy_result, self.get_word_vector(token_list[i][j])), ) #else: concatenated_result[i][j] = np.concatenate( (elmo_result[i][j], self.get_word_vector( token_list[i][j])), ) return concatenated_result def get_sentence_vector_ELMo_with_token_dump(self, token_list): if not self.use_ELMo: print( "Warning: ELMo embeddings requested but embeddings object wrongly initialised" ) return with tf.variable_scope('', reuse=tf.AUTO_REUSE): # the reuse=True scope reuses weights from the whole context self.elmo_input_token_dump = weight_layers( 'input', self.embeddings_op_token_dump, l2_coef=0.0) # Create batches of data local_token_ids = self.batcher_token_dump.batch_sentences(token_list) with tf.Session() as sess: # weird, for this cpu is faster than gpu (1080Ti !) 
with tf.device("/cpu:0"): # It is necessary to initialize variables once before running inference sess.run(tf.global_variables_initializer()) # Compute ELMo representations elmo_result = sess.run( self.elmo_input_token_dump['weighted_op'], feed_dict={self.token_ids: local_token_ids}) return elmo_result def _get_description(self, name): for emb in self.registry["embeddings"]: if emb["name"] == name: return emb for emb in self.registry["embeddings-contextualized"]: if emb["name"] == name: return emb return None def get_word_vector(self, word): """ Get static embeddings (e.g. glove) for a given token """ if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'): # the pre-trained embeddings are not cased word = word.lower() if self.env is None: # db not available, the embeddings should be available in memory (normally!) return self.get_word_vector_in_memory(word) try: with self.env.begin() as txn: txn = self.env.begin() vector = txn.get(word.encode(encoding='UTF-8')) if vector: word_vector = _deserialize_pickle(vector) vector = None else: word_vector = np.zeros((self.static_embed_size, ), dtype=np.float32) # alternatively, initialize with random negative values #word_vector = np.random.uniform(low=-0.5, high=0.0, size=(self.embed_size,)) # alternatively use fasttext OOV ngram possibilities (if ngram available) except lmdb.Error: # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! self.env.close() envFilePath = os.path.join(self.embedding_lmdb_path, self.name) self.env = lmdb.open(envFilePath, readonly=True, max_readers=2048, max_spare_txns=2, lock=False) return self.get_word_vector(word) return word_vector def get_ELMo_lmdb_vector(self, token_list, max_size_sentence): """ Try to get the ELMo embeddings for a sequence cached in LMDB """ if self.env_ELMo is None: # db cache not available, we don't cache ELMo stuff return None try: ELMo_vector = np.zeros( (len(token_list), max_size_sentence - 2, ELMo_embed_size), dtype='float32') with self.env_ELMo.begin() as txn: for i in range(0, len(token_list)): txn = self.env_ELMo.begin() # get a hash for the token_list the_hash = list_digest(token_list[i]) vector = txn.get(the_hash.encode(encoding='UTF-8')) if vector: # adapt expected shape/padding local_embeddings = _deserialize_pickle(vector) if local_embeddings.shape[0] > max_size_sentence - 2: # squeeze the extra padding space ELMo_vector[ i] = local_embeddings[:max_size_sentence - 2, ] elif local_embeddings.shape[ 0] == max_size_sentence - 2: # bingo~! ELMo_vector[i] = local_embeddings else: # fill the missing space with padding filler = np.zeros((max_size_sentence - (local_embeddings.shape[0] + 2), ELMo_embed_size), dtype='float32') ELMo_vector[i] = np.concatenate( (local_embeddings, filler)) vector = None else: return None except lmdb.Error: # no idea why, but we need to close and reopen the environment to avoid # mdb_txn_begin: MDB_BAD_RSLOT: Invalid reuse of reader locktable slot # when opening new transaction ! 
            self.env_ELMo.close()
            self.env_ELMo = lmdb.open(embedding_ELMo_cache, readonly=True, max_readers=2048, max_spare_txns=2, lock=False)
            return self.get_ELMo_lmdb_vector(token_list, max_size_sentence)
        return ELMo_vector

    def cache_ELMo_lmdb_vector(self, token_list, ELMo_vector):
        """
        Cache in LMDB the ELMo embeddings of a given sequence
        """
        if self.env_ELMo is None:
            # db cache not available, we don't cache ELMo stuff
            return None
        txn = self.env_ELMo.begin(write=True)
        for i in range(0, len(token_list)):
            # get a hash for the token_list
            the_hash = list_digest(token_list[i])
            txn.put(the_hash.encode(encoding='UTF-8'), _serialize_pickle(ELMo_vector[i]))
        txn.commit()

    def clean_ELMo_cache(self):
        """
        Delete the ELMo embeddings cache; this normally takes place after the completion of a training
        """
        if self.env_ELMo is None:
            # db cache not available, nothing to clean
            return
        else:
            for file in os.listdir(self.embedding_ELMo_cache):
                file_path = os.path.join(self.embedding_ELMo_cache, file)
                if os.path.isfile(file_path):
                    os.remove(file_path)
            os.rmdir(self.embedding_ELMo_cache)

    def get_word_vector_in_memory(self, word):
        if (self.name == 'wiki.fr') or (self.name == 'wiki.fr.bin'):
            # the pre-trained embeddings are not cased
            word = word.lower()
        if self.extension == 'bin':
            return self.model.get_numpy_vector(word)
        if word in self.model:
            return self.model[word]
        else:
            # for an unknown word, we use a vector filled with 0.0
            return np.zeros((self.static_embed_size,), dtype=np.float32)
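# --- Hedged usage sketch (not part of the original sources) ---
# A minimal, standalone illustration of the LMDB-backed lookup pattern used by
# get_word_vector() above: one pickled numpy vector per word, UTF-8 encoded words
# as keys, and a zero vector for out-of-vocabulary tokens. The database path and
# the 300-dimension size are assumptions for this example, and plain pickle stands
# in for the _serialize_pickle/_deserialize_pickle helpers.
import pickle

import lmdb
import numpy as np

EMBED_SIZE = 300            # assumed embedding dimension
DB_PATH = "/tmp/demo_emb"   # assumed database location

env = lmdb.open(DB_PATH, map_size=1024 * 1024 * 100)

# write a couple of vectors in one write transaction, as make_embeddings_lmdb() does in batches
with env.begin(write=True) as txn:
    for word in ("hello", "world"):
        vec = np.random.rand(EMBED_SIZE).astype("float32")
        txn.put(word.encode("UTF-8"), pickle.dumps(vec))

# read them back, falling back to zeros for unknown words
with env.begin() as txn:
    for word in ("hello", "unknown-token"):
        raw = txn.get(word.encode("UTF-8"))
        vec = pickle.loads(raw) if raw else np.zeros((EMBED_SIZE,), dtype="float32")
        print(word, vec.shape)

env.close()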
    def __init__(self, n_layers, n_source_vocab, n_units, catgy, doc_catgy,
                 senseid2netout, word2index, pre_trained_embedding, model_type,
                 multi_label, wsd_epoch=0, h=8, dropout=0.1, max_length=500,
                 use_label_smoothing=False, embed_position=False, wsd_model=None):
        super(Transformer, self).__init__()
        self.to_gpu()
        self.set_random_seed(123)
        self.word2index = word2index
        self.pre_trained_embedding = pre_trained_embedding
        self.model_type = model_type
        self.wsd_model = wsd_model
        self.multi_label = multi_label
        with self.init_scope():
            if self.pre_trained_embedding is not None:
                # build the initial look-up table from the pre-trained fastText vectors
                model = FastText(self.pre_trained_embedding)
                dim = len(model['a'])
                n_vocab = len(self.word2index.keys())
                self.look_up_table = self.xp.zeros((n_vocab, dim), dtype=np.float32)
                for word, index in self.word2index.items():
                    try:
                        self.look_up_table[index] = chainer.cuda.to_gpu(model.get_numpy_vector(word))
                    except Exception:
                        # fall back to a deterministic random initialisation for this index
                        self.xp.random.seed(index)
                        self.look_up_table[index][:] = self.xp.random.uniform(-0.25, 0.25, dim)
                self.embed_x = L.EmbedID(n_source_vocab, n_units, ignore_label=-1, initialW=self.look_up_table)
            else:
                self.embed_x = L.EmbedID(n_source_vocab, n_units, ignore_label=-1, initialW=linear_init)
            self.encoder = Encoder(n_layers, n_units, h, dropout)
            self.fc2 = L.Linear(in_size=n_units, out_size=len(doc_catgy), initialW=linear_init)
            self.fc2_wsd = L.Linear(in_size=n_units, out_size=len(catgy), initialW=linear_init)
            self.lookup_table_sense = L.EmbedID(in_size=len(catgy), out_size=n_units, ignore_label=-1, initialW=linear_init)
            self.lookup_table_sense_fixed = self.lookup_table_sense.W.data
            self.senseid2netout = senseid2netout
            self.senseid2netout['<PAD>'] = [-1]
            self.wsd_epoch = wsd_epoch
            if embed_position:
                self.embed_pos = L.EmbedID(max_length, n_units, ignore_label=-1)
        self.n_layers = n_layers
        self.n_units = n_units
        self.dropout = dropout
        self.use_label_smoothing = use_label_smoothing
        self.initialize_position_encoding(max_length, n_units)
        self.scale_emb = self.n_units ** 0.5  # originally 0.5
        self.doc_catgy = doc_catgy
        self.catgy = catgy
        self.inverse_catgy = {v: k for k, v in self.catgy.items()}
        self.wsd_netout2wordindex = {k: self.word2index[v] for k, v in self.inverse_catgy.items()}
        self.wsd_netout2wordindex[-1] = -1
        self.max_len = max_length
        self.le = None
        if self.multi_label == 1:
            self.le = MultiLabelBinarizer(
                classes=[i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])],
                sparse_output=False)
        elif self.multi_label == 0:
            self.le = LabelEncoder()
            self.le.fit([i[0] for i in sorted(self.doc_catgy.items(), key=lambda x: x[1])])
        self.to_gpu()
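# --- Hedged usage sketch (not part of the original sources) ---
# CPU-only illustration of the look-up table initialisation used in the Transformer
# __init__ above: each row of the table is the fastText vector of the corresponding
# word, with a seeded uniform fallback when the lookup fails. The model path and
# the toy word2index mapping are assumptions for this example.
import numpy as np
from pyfasttext import FastText

model = FastText("embeddings/wiki.en.bin")          # assumed path
word2index = {"bank": 0, "loan": 1, "qwzrtx": 2}    # toy vocabulary

dim = len(model["a"])
look_up_table = np.zeros((len(word2index), dim), dtype=np.float32)
for word, index in word2index.items():
    try:
        look_up_table[index] = model.get_numpy_vector(word)
    except Exception:
        # deterministic random row, seeded by the word index, as in the code above
        rng = np.random.RandomState(index)
        look_up_table[index][:] = rng.uniform(-0.25, 0.25, dim)

print(look_up_table.shape)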
# Load an existing model
# model = FastText()
# model.load_model('./path/to/model.bin')

# Train with the skip-gram model
skip_gram_model = FastText()
skip_gram_model.skipgram(input='./train.txt', output='skip_gram_model', epoch=100, lr=0.7)
print(skip_gram_model['贷款'])
# print(skip_gram_model.get_numpy_vector('贷款'))
# print(skip_gram_model.get_numpy_vector('贷款', normalized=True))

# word analogy: which word is closest to var1 + var2 - var3?
var1 = skip_gram_model.get_numpy_vector('人民币')
var2 = skip_gram_model.get_numpy_vector('贷款')
var3 = skip_gram_model.get_numpy_vector('外币')
print(skip_gram_model.words_for_vector(var1 + var2 - var3, k=1))

# for word in skip_gram_model.words:
#     print(word, skip_gram_model[word])

print(skip_gram_model.nearest_neighbors('贷款', k=2))

# if the test data is stored in a file, use this:
# skip_gram_model.predict_proba_file('./test.txt', k=2)

print("\n")

##################
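# --- Hedged usage sketch (not part of the original sources) ---
# Follow-up to the skip-gram training above: skipgram(output='skip_gram_model')
# writes 'skip_gram_model.bin', which can be reloaded later without retraining.
# The query sentence is an assumption for illustration only.
from pyfasttext import FastText

reloaded = FastText('skip_gram_model.bin')
print(reloaded.nearest_neighbors('贷款', k=5))

# a single vector for a whitespace-tokenised sentence
sentence_vec = reloaded.get_numpy_sentence_vector('人民币 贷款 利率')
print(sentence_vec.shape)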