Example #1
           4 : "four",
           5 : "five",
           6 : "six"}

colors = {  0   :   "blue",
            1   :   "orange",
            2   :   "green",
            3   :   "yellow",
            4   :   "red"
            }


app = Flask(__name__)
app.debug = True
#model = word2vec("../Data/vector_nyt-0.1.txt", "../Data/cluster_nyt-0.1.txt")
model = word2vec("./../Data/vector_rmrb-0.1.txt", "../Data/cluster_rmrb-0.1.txt", "./../Data/freq_demo.txt")
#model = word2vec("./../NewData/rmrb.vector-0.1.txt","./../NewData/rmrb.vector-0.1.txt","./../Data/freq_demo.txt")
prob = Prob("./../Data/rmrb.prob.afterextend")
#prob = Prob("./../NewData/rmrb.prob.all")

def search_result_oneword(word):
    global model
    return model.compute_kNN(word)

def shutdown_server():
    func = request.environ.get("werkzeug.server.shutdown")
    if func is None:
        raise RuntimeError('Not running with the Werkzeug Server')
    func()

@app.route("/")
Example #2
    print(i, len(data))
""" Here we sort words per their numbers of appearances """
words = pd.DataFrame({
    'words': list(words.keys()),
    'counts': list(words.values())
})
words = words.sort_values('counts', ascending=False).reset_index(drop=True)
""" Take first 10000 the most frequent words """
actual_words = words['words'][:10000]
actual_words = {actual_words[i]: i for i in range(len(actual_words))}
""" Training embeddings """
"""
The small embedding net is based on
https://towardsdatascience.com/word2vec-skip-gram-model-part-1-intuition-78614e4d6e0b
"""
proc2vec = word2vec(10000)
opt = keras.optimizers.Adam()
proc2vec.compile(optimizer=opt, loss='categorical_crossentropy')
proc2vec.summary()
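# The word2vec() model builder itself is not shown in this snippet. Assuming it
# follows the skip-gram net from the article linked above (one-hot input -> small
# dense "embedding" layer -> softmax over the vocabulary), a minimal sketch could
# look like the commented-out builder below; the layer sizes and names are
# assumptions, not the original code.
#
# def word2vec(vocab_size, emb_dim=300):
#     inp = keras.layers.Input(shape=(vocab_size,))
#     emb = keras.layers.Dense(emb_dim, use_bias=False)(inp)   # embedding weights
#     out = keras.layers.Dense(vocab_size, activation="softmax")(emb)
#     return keras.Model(inputs=inp, outputs=out)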


def onehot(pos, len_actual=10000):
    ans = np.zeros((len_actual, 1))
    ans[pos, 0] = 1
    return ans
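# For example, onehot(2, len_actual=5) returns the 5x1 column vector
# [[0.], [0.], [1.], [0.], [0.]], i.e. the target format used with the
# categorical cross-entropy loss configured above.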


def train_append(s):
    train = []
    positions = {str(i): 1 for i in range(len(s))}
    for j0, j in enumerate(s):
Example #3
# -*- coding: utf-8 -*-
"""
Created on Sun Aug 19 14:03:12 2018
@author: Moc
"""
from model import bow
from model import tfidf
from model import word2vec
from dataset import load_data
from dataset import load_w2v


if __name__ == '__main__':
    # load the data
    X_train, X_test, y_train, y_test = load_data()
    # test the models
    bow_clf = bow(X_train, X_test, y_train, y_test)
    tfidf_clf = tfidf(X_train, X_test, y_train, y_test)

    X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = load_w2v()
    word2vec_clf = word2vec(X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec)  # not accurate
Example #4
filename = "medium_text.txt"
# filename = 'tsts.txt'
print("Parsing text and loading training data...")
vocab, word_to_ix, ix_to_word, training_data = load_data(filename,
                                                         CONTEXT_SIZE,
                                                         model_type="skipgram",
                                                         subsampling=True,
                                                         sampling_rate=0.001)
print('len(training_data): ', len(training_data), '\n')

losses = []
loss_function = nn.NLLLoss()
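# nn.NLLLoss expects log-probabilities, so the word2vec model imported here
# presumably ends with a log_softmax over the vocabulary (its definition is not
# shown in this snippet).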

# model = SkipGram(len(vocab), EMBEDDING_DIM)
model = word2vec(len(word_to_ix), EMBEDDING_DIM)
# optimizer = optim.SGD(model.parameters(), lr=0.001)
optimizer = optim.SGD(model.parameters(), lr=0.008, momentum=0.9)
# print(model, '\n')
# print(optimizer, '\n')
# exit()
batch_size = 500
print("Starting training")
for epoch in tqdm(range(NUM_EPOCHS)):
    # total_loss = torch.Tensor([0])
    print("Beginning epoch %d/%d" % (epoch, NUM_EPOCHS))

    # '''
    training_data = np.array(training_data)
    #print(type(training_data), training_data.shape)
    # print(training_data.shape)
Example #5
def build_model(self):
    self.model = word2vec(self.num_emb, self.emb_size)
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.lr)
    self.model.to(self.device)
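
# A training step with this setup might look like the sketch below; the loss and
# the batch variables are assumptions, since only build_model() is shown here.
#
# def train_step(self, center, context):
#     self.optimizer.zero_grad()
#     logits = self.model(center)        # assumes the model returns vocabulary logits
#     loss = torch.nn.functional.cross_entropy(logits, context)
#     loss.backward()
#     self.optimizer.step()
#     return loss.item()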
Example #6
    print(index1)
    print(index2)

    emb1 = [emb[index1]]
    emb2 = [emb[index2]]

    distance = cdist(emb1, emb2, dist_type)

    return distance
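
# scipy's cdist expects 2-D inputs, which is why the two embedding rows above are
# wrapped in single-element lists; e.g. with dist_type="cosine" the call returns a
# 1x1 matrix holding the cosine distance between the two word vectors.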

config = get_config()
data = data_loader(config)
num_emb, _, _, _, _ = data.preprocess()

model = word2vec(num_emb, config.emb_size)
model.load_state_dict(torch.load("./test.pt"))

with open('./vocab_dict.txt','r') as f:
    vocab_dict = json.loads(f.read())
    print(type(vocab_dict))
print(vocab_dict)


params = []
filename = "parameters.txt"

# write embedded words
with open("parameters.txt", "w") as f:
    for param in model.parameters():
        for thing in param:
Example #7
                          keras.callbacks.ModelCheckpoint(
                              w2v_model_path,
                              monitor='loss',
                              verbose=0,
                              save_best_only=True,
                              save_weights_only=True,
                              mode='auto',
                              period=5),
                          keras.callbacks.EarlyStopping(monitor='loss',
                                                        patience=2,
                                                        verbose=0,
                                                        mode='auto',
                                                        min_delta=0.01)
                      ])

    w2v_model = model.word2vec(vocab_len, vec_size=100)
    w2v_model.load_weights(w2v_model_path, by_name=True)

    p = w2v_model.predict(
        x=numpy.array([vocabulary.word_index(vocab, "great")]))

    distances = []

    for word in vocab["word"]:
        p_w = w2v_model.predict(
            x=numpy.array([vocabulary.word_index(vocab, word)]))
        dist = numpy.linalg.norm(p.flatten().reshape((100, 1)) -
                                 p_w.flatten().reshape((100, 1)))
        distances.append(("great", word, dist))
    sorted_distances = sorted(distances, key=lambda w: w[2])
    print(sorted_distances[:20])
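
# Calling predict() once per vocabulary word is slow; the same ranking could be
# computed in one batched pass, roughly as sketched below (assuming vocab["word"]
# is the ordered sequence of words this snippet already iterates over):
#
# all_ix = numpy.array([vocabulary.word_index(vocab, w) for w in vocab["word"]])
# all_p = w2v_model.predict(x=all_ix).reshape(len(all_ix), -1)   # one row per word
# dists = numpy.linalg.norm(all_p - p.flatten(), axis=1)
# nearest = sorted(zip(vocab["word"], dists), key=lambda w: w[1])[:20]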