def plot_embedding_dimension(emb, dim, charpoints):
    """Plot every character's value along one embedding dimension and save it.

    Each character in `charpoints` is drawn as a text label whose x position
    is its embedding value in column `dim`. The y position is random jitter
    in [.4, .6) that only serves to spread the labels out; adjust_text() is
    then allowed to move labels along y only.

    The image is written to 'vis/d<type><dim>.png', where <type> is the single
    entry of ALLOWED_TYPES (empty when several types are allowed). Note that
    the title is one-based (dim + 1) while the file name uses `dim` as given.

    Args:
        emb: embedding table indexed by character code; assumed to be a 2-D
            numpy array (code point x dimension) -- TODO confirm.
        dim: zero-based index of the embedding dimension to plot.
        charpoints: integer character codes to plot (each is passed to chr()).
    """
    # Values along the requested dimension, computed once for both axis bounds.
    values = emb[charpoints][:, dim]

    fig = plt.figure(figsize=(20, 4))
    try:
        plt.axis([np.min(values), np.max(values), .3, .7])
        plt.yticks([])
        plt.ylabel('')
        plt.title('Dimension {}'.format(dim + 1))

        texts = []
        for cp in charpoints:
            # Vertical jitter inside the visible band (.3 to .7).
            y = .4 + random.random() * .2
            t = plt.text(emb[cp][dim], y, charify(cp))
            color = char_color(chr(cp))
            t.set_bbox(dict(color=color, alpha=.5, boxstyle='round'))
            texts.append(t)

        adjust_text(
            texts,
            only_move={'text': 'y'},
            force_text=14.5,
            expand_points=(1.2, 1.2),
            lim=5000,
        )

        fname = 'vis/d{}{}.png'.format(
            ALLOWED_TYPES[0] if len(ALLOWED_TYPES) == 1 else '', dim)
        plt.savefig(fname, bbox_inches='tight')
    finally:
        # Close (not just clear) the figure: every call creates a new one, and
        # cleared figures would otherwise pile up in pyplot's figure registry.
        plt.close(fig)
def analogy(a, b, j, k=3, targ=None):
    """Print the k characters that best complete "a is to b as j is to ?".

    The guess vector is embedding[j] + (embedding[b] - embedding[a]); its k
    nearest neighbours (per `nbrs`) are printed one per line as
    "char:distance". If `targ` is given, the Euclidean distance between the
    guess and the embedding of `targ` is printed as well.

    Args:
        a, b, j: single characters forming the analogy.
        k: number of neighbours to print.
        targ: optional character to measure the guess against.
    """
    src, dst, query = ord(a), ord(b), ord(j)

    # Apply the a -> b offset to j.
    guess = embedding[query] + (embedding[dst] - embedding[src])

    neighbour_dists, neighbour_rows = nbrs.kneighbors([guess], k)
    for gap, row in zip(neighbour_dists.ravel(), neighbour_rows.ravel()):
        # Rows index into charpoints, not directly into character codes.
        print("{}:{:.1f}".format(charify(charpoints[row]), gap))

    if not targ:
        return

    target_vector = embedding[ord(targ)]
    print("Distance to {}: {:.1f}".format(
        targ, scipy.spatial.distance.euclidean(target_vector, guess)))
def pprint_char(c):
    """Return a printable form of the single character `c`.

    Characters whose code equals one of the special VOCAB entries are shown
    as a marker ('^', '$', '_', '<BOS>', '<EOS>'). A literal '^', '$' or '_'
    is backslash-escaped so it cannot be confused with a marker. Anything
    else is delegated to common.charify().
    """
    code = ord(c)

    # Checked in order; the first matching VOCAB entry wins.
    special_markers = (
        ('bow', '^'),
        ('eow', '$'),
        ('pad', '_'),
        ('bos', '<BOS>'),
        ('eos', '<EOS>'),
    )
    for vocab_key, marker in special_markers:
        if code == VOCAB[vocab_key]:
            return marker

    if c in '^$_':
        return '\\' + c
    return common.charify(c)
def nn(vec):
    """Print the three neighbours of `vec` returned by `nbrs`.

    One line per neighbour, formatted as "char:distance" with the distance
    rounded to one decimal place.
    """
    neighbour_dists, neighbour_rows = nbrs.kneighbors([vec], 3)
    for gap, row in zip(neighbour_dists.ravel(), neighbour_rows.ravel()):
        # Rows index into charpoints; map back to a displayable character.
        label = charify(charpoints[row])
        print("{}:{:.1f}".format(label, gap))
return scipy.spatial.distance.euclidean(a, b)


def nn(vec):
    """Print the 3 neighbours of `vec` found by `nbrs`, one "char:distance" per line."""
    dist, idxs = nbrs.kneighbors([vec], 3)
    for d,i in zip(dist.ravel(), idxs.ravel()):
        # i is a row of the fitted matrix, i.e. an index into charpoints.
        print "{}:{:.1f}".format(charify(charpoints[i]), d)


def an(abj, k=3, target=None):
    """Shorthand for analogy(): `abj` is unpacked into its positional arguments.

    Presumably `abj` is a 3-character string or 3-tuple (a, b, j) -- confirm
    against callers.
    """
    return analogy(*abj, k=k, targ=target)


# Character categories to include in the analysis. Full set kept for reference:
#ALLOWED_TYPES = ['digit', 'uppercase', 'lowercase', 'meta', 'punctuation']
ALLOWED_TYPES = ['uppercase', 'lowercase']

# Codes below 128 whose char_type() is one of the allowed categories.
charpoints = [i for i in range(128) if char_type(i) in ALLOWED_TYPES]
embedding = get_embedding()

# NUM[i] is the embedding row of the digit character str(i), for i in 0..9.
NUM = {}
for i in range(10):
    NUM[i] = embedding[ord(str(i))]

# Rows of X line up with charpoints, so neighbour indices index charpoints.
X = embedding[charpoints]
# One extra neighbour is requested because each point's result list includes
# itself (skipped when printing below).
nbrs = NearestNeighbors(n_neighbors=N_NEIGHBS+1, algorithm='brute').fit(X)
distances, indices = nbrs.kneighbors(X)

# Print one line per character: the character, a tab, then its neighbours as
# "char:distance" entries. The trailing commas are Python 2 print syntax that
# suppresses the newline; the bare `print` ends the line.
for i, cp in enumerate(charpoints):
    print charify(cp) + '\t',
    for i2, ddist in zip(indices[i], distances[i])[1:]: # skip self-matches
        print '{}:{:.1f} '.format(charify(charpoints[i2]), ddist),
    print
x_max[0] + x_pad * 2, x_min[1] - y_pad, x_max[1] + y_pad * 2, ]) plt.yticks([]) plt.xticks([]) texts = [] xs = [] ys = [] for i, charpoint in enumerate(charpoints): if HIDE_OTHER_TYPES and char_type(charpoint) not in ALLOWED_TYPES: continue pt = X_sne[i] if TEXT_MODE: char = charify(charpoint) xs.append(pt[0]) ys.append(pt[1]) t = plt.text( pt[0], pt[1], char, fontdict={'size': 14}, ha="center", va="center", ) if COLORIZE: color = char_color(chr(charpoint)) t.set_bbox( dict( color=color,
import sys
import string
import common

# Count how often each byte value (0-255) occurs in a file and print one
# tab-separated line per value: value, count, common.charify(value).

# Input file: first command-line argument, falling back to the default corpus.
try:
    fname = sys.argv[1]
except IndexError:
    fname = '../data/news.txt'

byte_counts = [0] * 256

# Binary mode so every raw byte is counted exactly once (no newline
# translation or text decoding); read in chunks instead of one byte per call.
with open(fname, 'rb') as f:
    for chunk in iter(lambda: f.read(65536), b''):
        # bytearray yields integer byte values on both Python 2 and Python 3.
        for value in bytearray(chunk):
            byte_counts[value] += 1

print('\n'.join('{}\t{}\t{}'.format(i, byte_counts[i], common.charify(i)) for i in range(256)))