Example #1
import sys

import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage


def main():
    model = load_w2v(sys.argv[1])
    # Cluster the 20 nearest neighbours of 'country' with Ward linkage.
    words = [w for w, _ in model.most_similar(positive=['country'], topn=20)]
    vecs = [model[w] for w in words]
    result = linkage(vecs, method='ward')
    dendrogram(result, labels=words, orientation='right')
    plt.show()
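These snippets all call a load_w2v helper that the listing does not show. A minimal sketch of what it might look like, assuming gensim's KeyedVectors and pretrained vectors in binary word2vec format (e.g. GoogleNews-vectors-negative300.bin); the later examples call it without arguments, so the project's real helper presumably has a default path:

from gensim.models import KeyedVectors

def load_w2v(path):
    # Load pretrained word vectors stored in word2vec binary format.
    return KeyedVectors.load_word2vec_format(path, binary=True)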
Example #2
import sys


def main():
    model = load_w2v(sys.argv[1])
    with open(sys.argv[2]) as f:
        for line in f:
            # Category headers in questions-words.txt start with ':'.
            if line.startswith(':'):
                print(line.rstrip())
                continue
            w1, w2, w3, w4 = line.rstrip().split()
            # Predict the analogy w1 : w2 = w3 : ?  (vec(w2) - vec(w1) + vec(w3)).
            word, sim = model.most_similar(positive=[w2, w3], negative=[w1], topn=1)[0]
            print(' '.join([w1, w2, w3, w4, word, str(sim)]))
Example #3
import sys

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE


def main():
    model = load_w2v(sys.argv[1])
    words = [w for w, _ in model.most_similar(positive=['country'], topn=20)]
    vecs = [model[w] for w in words]

    # Project the word vectors to 2D with t-SNE; keep perplexity below the
    # sample count (20 here) so recent scikit-learn versions accept it.
    result = TSNE(n_components=2, random_state=0, perplexity=10).fit_transform(vecs)

    plt.scatter(result[:, 0], result[:, 1])
    for point, word in zip(result, words):
        plt.annotate(word, point)
    plt.show()
Example #4
import sys

from sklearn.cluster import KMeans


def main():
    model = load_w2v(sys.argv[1])
    words = [w for w, _ in model.most_similar(positive=['country'], topn=20)]
    vecs = [model[w] for w in words]

    # Partition the 20 word vectors into 5 k-means clusters.
    labels = KMeans(n_clusters=5, random_state=0).fit_predict(vecs)
    result = [[] for _ in range(5)]
    for word, label in zip(words, labels):
        result[label].append(word)

    for label in range(5):
        print('class {}:'.format(label))
        print(', '.join(result[label]) + '\n')
Example #5
import sys


def main():
    model = load_w2v(sys.argv[1])
    # vec('Spain') - vec('Madrid') + vec('Athens'): the expected top hit is 'Greece'.
    most_similars = model.most_similar(positive=['Spain', 'Athens'],
                                       negative=['Madrid'])
    for word, sim in most_similars:
        print('{}\t{}'.format(word, sim))
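most_similar with positive and negative terms ranks the vocabulary by cosine similarity to a combined query vector, here roughly vec('Spain') - vec('Madrid') + vec('Athens'). A rough sketch of the same computation done by hand, assuming gensim 4.x's KeyedVectors API (get_vector, cosine_similarities, index_to_key); it approximates what most_similar does internally rather than replicating it exactly:

import numpy as np

def manual_most_similar(model, positive, negative, topn=10):
    # Average the unit-normalized positive vectors and subtract the negative ones.
    vecs = [model.get_vector(w, norm=True) for w in positive]
    vecs += [-model.get_vector(w, norm=True) for w in negative]
    query = np.mean(vecs, axis=0)
    # Rank every vocabulary word by cosine similarity to the query vector.
    sims = model.cosine_similarities(query, model.vectors)
    results = []
    for idx in np.argsort(-sims):
        word = model.index_to_key[idx]
        if word in positive or word in negative:
            continue  # gensim also excludes the input words from the results
        results.append((word, float(sims[idx])))
        if len(results) == topn:
            break
    return results

# e.g. manual_most_similar(model, ['Spain', 'Athens'], ['Madrid'])[0]
# is expected to return something like ('Greece', 0.7...).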
Example #6
import pandas as pd
from tqdm import tqdm

from q63 import analogy

# load_w2v and project_path are helpers defined elsewhere in the project.


def load_question_words():
    # questions-words.txt holds analogy rows "a b c d"; category headers start with ':'.
    data = pd.read_csv(
        project_path / "data/w2v/questions-words.txt",
        sep=" ",
        skiprows=1,
        header=None,
    )
    # Drop the remaining category header rows.
    data = data[data.iloc[:, 0] != ":"]
    return data


if __name__ == "__main__":
    print("\rSolving Q64 ... ", end="")

    w2v = load_w2v()

    data = load_question_words()
    # For each row (a, b, c, d), predict the word closest to vec(b) - vec(a) + vec(c).
    analogies = [
        analogy(w2v, r[1], r[0], r[2], topn=1)[0]
        for _, r in tqdm(list(data.iterrows()))
    ]
    data.insert(4, 4, [a[0] for a in analogies])  # predicted word
    data.insert(5, 5, [a[1] for a in analogies])  # cosine similarity

    data.to_csv(project_path / "output/w2v/analogy.csv")

    print("Done.")
Example #7
import sys


def main():
    model = load_w2v(sys.argv[1])
    # The 10 nearest neighbours of 'United_States' by cosine similarity.
    most_similars = model.most_similar('United_States', topn=10)
    for word, sim in most_similars:
        print('{}\t{}'.format(word, sim))
Example #8
import sys


def main():
    model = load_w2v(sys.argv[1])
    # Cosine similarity between the two word vectors.
    similarity = model.similarity('United_States', 'U.S.')
    print(similarity)
Example #9
File: q70.py Project: simaki/nlp100
def __init__(self, min_freq=1, root=project_path / "data/news/"):
    # Keep the pretrained word vectors and dataset settings on the instance.
    w2v = load_w2v()
    self.w2v = w2v
    self.min_freq = min_freq
    self.root = root
    self.padding_idx = 0