4 : "four", 5 : "five", 6 : "six"} colors = { 0 : "blue", 1 : "orange", 2 : "green", 3 : "yellow", 4 : "red" } app = Flask(__name__) app.DEBUG = True #model = word2vec("../Data/vector_nyt-0.1.txt", "../Data/cluster_nyt-0.1.txt") model = word2vec("./../Data/vector_rmrb-0.1.txt", "../Data/cluster_rmrb-0.1.txt", "./../Data/freq_demo.txt") #model = word2vec("./../NewData/rmrb.vector-0.1.txt","./../NewData/rmrb.vector-0.1.txt","./../Data/freq_demo.txt") prob = Prob("./../Data/rmrb.prob.afterextend") #prob = Prob("./../NewData/rmrb.prob.all") def search_result_oneword(word): global model return model.compute_kNN(word) def shutdown_server(): func = request.environ.get("werkzeug.server.shutdown") if func is None: raise RuntimeError('Not running with the Werkzeug Server') func() @app.route("/")
print(i, len(data))

""" Here we sort words by their number of occurrences """
words = pd.DataFrame({
    'words': list(words.keys()),
    'counts': list(words.values())
})
words = words.sort_values('counts', ascending=False).reset_index(drop=True)

""" Keep the 10000 most frequent words """
actual_words = words['words'][:10000]
actual_words = {actual_words[i]: i for i in range(len(actual_words))}

""" Training embeddings """
""" The small embedding net is based on
https://towardsdatascience.com/word2vec-skip-gram-model-part-1-intuition-78614e4d6e0b """
proc2vec = word2vec(10000)
opt = keras.optimizers.Adam()
proc2vec.compile(optimizer=opt, loss='categorical_crossentropy')
proc2vec.summary()


def onehot(pos, len_actual=10000):
    """Return a one-hot column vector with a 1 at position `pos`."""
    ans = np.zeros((len_actual, 1))
    ans[pos, 0] = 1
    return ans


def train_append(s):
    train = []
    positions = {str(i): 1 for i in range(len(s))}
    for j0, j in enumerate(s):
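# Illustrative sketch: train_append is cut off above. A plausible completion,
# assuming s is a sequence of word indices into the 10000-word vocabulary and
# that training pairs are (centre, context) one-hot vectors within a fixed
# window; the window size and the helper's name are assumptions.
def train_append_sketch(s, window=2, len_actual=10000):
    train = []
    for j0, j in enumerate(s):
        for k in range(max(0, j0 - window), min(len(s), j0 + window + 1)):
            if k == j0:
                continue
            # input: one-hot of the centre word, target: one-hot of a context word
            train.append((onehot(j, len_actual), onehot(s[k], len_actual)))
    return train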
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 19 14:03:12 2018

@author: Moc
"""
from model import bow
from model import tfidf
from model import word2vec
from dataset import load_data
from dataset import load_w2v

if __name__ == '__main__':
    # Load the data
    X_train, X_test, y_train, y_test = load_data()
    # Evaluate the models
    bow_clf = bow(X_train, X_test, y_train, y_test)
    tfidf_clf = tfidf(X_train, X_test, y_train, y_test)
    X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = load_w2v()
    word2vec_clf = word2vec(X_train_word2vec, X_test_word2vec,
                            y_train_word2vec, y_test_word2vec)  # not accurate
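# Illustrative sketch: model.bow is not shown here. A minimal bag-of-words
# baseline it could resemble, assuming raw text inputs and a scikit-learn
# pipeline (CountVectorizer plus logistic regression); all names below are
# assumptions, not the project's actual implementation.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline


def bow_sketch(X_train, X_test, y_train, y_test):
    clf = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
    clf.fit(X_train, y_train)
    print("bag-of-words test accuracy:", clf.score(X_test, y_test))
    return clf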
filename = "medium_text.txt" # filename = 'tsts.txt' print("Parsing text and loading training data...") vocab, word_to_ix, ix_to_word, training_data = load_data(filename, CONTEXT_SIZE, model_type="skipgram", subsampling=True, sampling_rate=0.001) print('len(training_data): ', len(training_data), '\n') losses = [] loss_function = nn.NLLLoss() # model = SkipGram(len(vocab), EMBEDDING_DIM) model = word2vec(len(word_to_ix), EMBEDDING_DIM) # optimizer = optim.SGD(model.parameters(), lr=0.001) optimizer = optim.SGD(model.parameters(), lr=0.008, momentum=0.9) # print(model, '\n') # print(optimizer, '\n') # exit() batch_size = 500 print("Starting training") for epoch in tqdm(range(NUM_EPOCHS)): # total_loss = torch.Tensor([0]) print("Beginning epoch %d/%d" % (epoch, NUM_EPOCHS)) # ''' training_data = np.array(training_data) #print(type(training_data), training_data.shape) # print(training_data.shape)
def build_model(self):
    self.model = word2vec(self.num_emb, self.emb_size)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
    self.model.to(self.device)
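# Illustrative sketch: the word2vec class instantiated in build_model is not
# shown here. A minimal skip-gram-style module it could correspond to, assuming
# an embedding layer followed by a projection back to the vocabulary and a
# log-softmax so nn.NLLLoss can be applied downstream; the class name is an
# assumption.
import torch
import torch.nn as nn


class Word2VecSketch(nn.Module):
    def __init__(self, num_emb, emb_size):
        super().__init__()
        self.embeddings = nn.Embedding(num_emb, emb_size)  # one vector per word
        self.output = nn.Linear(emb_size, num_emb)         # project back to vocab

    def forward(self, word_idx):
        emb = self.embeddings(word_idx)
        return torch.log_softmax(self.output(emb), dim=-1)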
    print(index1)
    print(index2)
    emb1 = [emb[index1]]
    emb2 = [emb[index2]]
    distance = cdist(emb1, emb2, dist_type)
    return distance


config = get_config()
data = data_loader(config)
num_emb, _, _, _, _ = data.preprocess()

model = word2vec(num_emb, config.emb_size)
model.load_state_dict(torch.load("./test.pt"))

with open('./vocab_dict.txt', 'r') as f:
    vocab_dict = json.loads(f.read())
print(type(vocab_dict))
print(vocab_dict)

params = []
filename = "parameters.txt"

# write embedded words
with open("parameters.txt", "w") as f:
    for param in model.parameters():
        for thing in param:
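# Illustrative sketch: the parameter-dumping loop above is cut off. A plausible
# completion, assuming the goal is to write every row of every parameter tensor
# as a line of space-separated floats; the exact output format is an assumption.
with open(filename, "w") as f:
    for param in model.parameters():
        for row in param:
            values = row.detach().cpu().numpy().ravel().tolist()
            f.write(" ".join(str(v) for v in values) + "\n")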
        keras.callbacks.ModelCheckpoint(w2v_model_path,
                                        monitor='loss',
                                        verbose=0,
                                        save_best_only=True,
                                        save_weights_only=True,
                                        mode='auto',
                                        period=5),
        keras.callbacks.EarlyStopping(monitor='loss',
                                      patience=2,
                                      verbose=0,
                                      mode='auto',
                                      min_delta=0.01)
    ])

w2v_model = model.word2vec(vocab_len, vec_size=100)
w2v_model.load_weights(w2v_model_path, by_name=True)

p = w2v_model.predict(x=numpy.array([vocabulary.word_index(vocab, "great")]))

distances = []
for word in vocab["word"]:
    p_w = w2v_model.predict(x=numpy.array([vocabulary.word_index(vocab, word)]))
    dist = numpy.linalg.norm(p.flatten().reshape((100, 1)) -
                             p_w.flatten().reshape((100, 1)))
    distances.append(("great", word, dist))

sorted_distances = sorted(distances, key=lambda w: w[2])
print(sorted_distances[:20])
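# Illustrative sketch: predicting one word at a time is slow. An equivalent,
# vectorised variant that embeds the whole vocabulary in a single predict call
# and computes all Euclidean distances to "great" at once, assuming predict
# returns one 100-dimensional vector per input index; the variable names are
# assumptions.
indices = numpy.array([vocabulary.word_index(vocab, w) for w in vocab["word"]])
all_vecs = w2v_model.predict(x=indices)                    # shape (n_words, 100)
dists = numpy.linalg.norm(all_vecs - p.flatten(), axis=1)  # distance to "great"
nearest = sorted(zip(vocab["word"], dists), key=lambda t: t[1])[:20]
print(nearest)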