def info_dataset(dev=NA):
    """Print per-item min/mean/max similarity statistics for the 'so'
    positive and negative pair sets, then report the aggregated summary
    rows through pr.params.

    dev: target device; defaults to DEV via nav().
    """
    dev = nav(dev, DEV)
    # so.pair holds (n, positive pairs, negative pairs) -- unpacking below
    # establishes this shape.
    n, pos, neg = Path('so.pair').file().load()
    # Loaded object supports .to(dev) and is indexed as sim[i][mask],
    # so presumably an n x n similarity tensor -- TODO confirm.
    sim = Path('so.sim').file().load().to(dev)
    rs = []
    for pair in [pos, neg]:
        # NOTE(review): TensData(...)(tens0(...)) appears to scatter the
        # pair list into an n x n boolean mask (dt=DTB) -- verify against
        # TensData's definition.
        pair = TensData(pair, 1, s=TRUE)(tens0((n, n), dt=DTB, dev=dev))
        r = []
        for i in range(n):
            # Similarities of item i restricted to its masked partners.
            t = sim[i][pair[i]]
            a = t.min().tolist()
            m = t.mean().tolist()
            b = t.max().tolist()
            r.append([a, m, b])
            pr(i, a, m, b)
        # Stack the per-item [min, mean, max] rows and aggregate columns.
        t = tens(r, dt=DTR, dev=dev)
        ma = t[:, 0].min().tolist()   # global minimum of per-item minima
        a = t[:, 0].mean().tolist()   # mean of per-item minima
        m = t[:, 1].mean().tolist()   # mean of per-item means
        b = t[:, 2].mean().tolist()   # mean of per-item maxima
        mb = t[:, 2].max().tolist()   # global maximum of per-item maxima
        t = [ma, a, m, b, mb]
        pl(*t)
        rs.append(t)
    pr.params({'exts': '10'})(rs)
def draw_qxl(x, y):
    """Draw the vector (x, y) as a blue arrow from the origin, then draw
    each row of (tense(2) - v.T @ v) as a red arrow starting at the
    vector's tip. Presumably a projection/rejection visualisation --
    depends on what `tense` constructs (see note below).
    """
    before_draw()
    t = tens([x, y], dt=DTR).view(1, -1)  # row vector, shape (1, 2)
    # NOTE(review): `tense` is assumed to build an identity-like square
    # matrix of size t.numel() -- confirm against its definition before
    # relying on the geometric interpretation above.
    r = (tense(t.numel(), dt=DTR) - t.t().matmul(t)).tolist()
    t = t.squeeze().tolist()  # back to a plain [x, y] list for plt.arrow
    # Base vector in blue, anchored at the origin.
    plt.arrow(0, 0, *t, length_includes_head=TRUE, head_width=0.1, head_length=0.1, ec='b', fc='b')
    # One red arrow per matrix row, anchored at the tip of the base vector.
    for i in r:
        plt.arrow(*t, *i, length_includes_head=TRUE, head_width=0.1, head_length=0.1, ec='r', fc='r')
    dom = 5  # half-width of the viewport; height is half of that
    plt.xlim(-dom, dom)
    plt.ylim(-dom / 2, dom / 2)
    after_draw()
    plt.show()
def proc_km(k, v):
    """Summarize kmeans runs: map each metric name in `k` to a
    (mean%, std%) pair computed column-wise over `v`, rounded to 4 dp.
    """
    scores = tens(v, dt=DTR, dev=dev)
    means = [round(x, 4) for x in scores.mean(dim=0).mul(100).tolist()]
    stds = [round(x, 4) for x in scores.std(dim=0).mul(100).tolist()]
    return dict(zip(k, zip(means, stds)))
def draw_sj(k=NA, t=NA, n=NA):
    """Plot decay curves y = a**x with a = k**(1/t), one labelled line per
    value in `k`, over x = 0..n-1.

    k: list of target values at step t (default [0.1 .. 0.9]).
    t: step at which each curve reaches its k value (default 100).
    n: number of x samples (default 1000).
    """
    k = nav(k, [0.1, 0.3, 0.5, 0.7, 0.9])
    t = nav(t, 100)
    n = nav(n, 1000)
    before_draw()
    curves = len(k)
    k = tens(k, dt=DTR)
    bases = k**(1 / t)
    xs = tensa(n, dt=DTR)
    ys = bases.view(-1, 1)**xs
    for row in range(curves):
        tag = f'{k[row].tolist():.1f}:{bases[row].tolist():.4f}'
        plt.plot(xs.numpy(), ys[row].numpy(), label=tag)
    plt.legend()
    after_draw()
    plt.show()
def draw_bkm(fn):
    """Parse a result log whose lines look like
    '<dn>-<sn>-<pn>-<iter>: {metric: value, ...}' and save one subplot
    per (dataset, metric) pair to '<fn>.jpg'.
    """
    from matplotlib import pyplot as plt
    plt.rcParams['figure.figsize'] = (40, 15)
    plt.rcParams['figure.dpi'] = 100
    plt.rcParams['savefig.dpi'] = 100
    # r: dataset -> metric -> '<sn>-<pn>' series key -> [value per iteration]
    r = {}
    for i in get_lines(File(fn)):
        k, v = i.split(':', 1)
        dn, sn, pn, _ = k.split('-')
        sn = f'{sn}-{pn}'  # fold setting and phase into one series label
        # SECURITY NOTE(review): eval() on file content -- acceptable only
        # for trusted local result logs; never run this on untrusted input
        # (ast.literal_eval would be the safer choice).
        v = eval(v)
        if dn not in r:
            r[dn] = {}
        for c in v:
            if c not in r[dn]:
                r[dn][c] = {}
            if sn not in r[dn][c]:
                r[dn][c][sn] = []
            t = v[c]
            if isTuple(t):
                t = t[0]  # tuple entries: keep only the leading value
            r[dn][c][sn].append(t)
    i = 0
    for dn in r:
        for c in r[dn]:
            plt.subplot(241 + i)  # 2 x 4 grid; i indexes the next cell
            for sn in r[dn][c]:
                t = r[dn][c][sn]
                # 1-based iteration numbers on the x axis.
                x = (tensa(len(t), dt=DTI) + 1).numpy()
                y = tens(t, dt=DTR).numpy()
                plt.plot(x, y, label=sn)
            plt.title(f'{dn}-{c}')
            plt.legend()
            plt.grid()
            i += 1
    bj = 0.05  # outer border margin (fraction of figure)
    jj = 0.2   # spacing between subplots
    plt.subplots_adjust(left=bj, right=1 - bj, bottom=bj, top=1 - bj, wspace=jj, hspace=jj)
    plt.savefig(f'{fn}.jpg')
    plt.clf()
    plt.cla()
def get_wvs():
    """For each dataset in `dns`, build a word-embedding matrix from the
    GoogleNews word2vec binary file (rows ordered by the dataset's word
    ids, zero rows for out-of-vocabulary words) and store it to
    '<dn>-wvs'.
    """
    import numpy as np

    def load_word_vec_skipgram(wid):
        """Generator over the word2vec binary format. First yields the
        (vocab_size, vec_size) header, then (word_id, vector) for every
        word present in `wid`.
        """
        file = File('data/wv/GoogleNews-vectors-negative300.bin')
        dt = np.dtype('float32')
        with file.open('rb') as f:
            # Header line: "<vocab_size> <vec_size>".
            header = f.readline().decode()
            vocab_size, vec_size = map(int, header.split())  # 300_0000, 300
            yield vocab_size, vec_size
            vec_len = vec_size * dt.itemsize  # 1200 bytes per vector
            for line in range(vocab_size):
                # Each record: word bytes terminated by b' ', then the raw
                # vector. Stray b'\n' bytes are skipped, not stored.
                word = []
                while TRUE:
                    b = f.read(1)
                    if b == b' ':
                        break
                    elif b != b'\n':
                        word.append(b)
                word = b''.join(word).decode()
                vec = f.read(vec_len)
                if word in wid:
                    yield wid[word], np.frombuffer(vec, dtype=dt)

    for dn in dns:
        print(f'dn: {dn}')
        xpp = File(f'data/{dn}-xpp').load()
        wid = TFIDF(xpp).wid  # word -> integer id mapping for this dataset
        print(f'size_vocab: {len(wid)}')
        load_word_vec = load_word_vec_skipgram(wid)
        # First item from the generator is the header; only vec_size is used.
        _, vec_size = next(load_word_vec)
        wvs = {
            w: tens(v, dt=DTR, dev=dev).view(1, -1)
            for w, v in load_word_vec
        }
        print(f'size_vocab_wv: {len(wvs)}')
        # Dense id-ordered matrix; OOV words get an all-zero row.
        wvs = [(wvs[i] if i in wvs else tens0((1, vec_size), dt=DTR, dev=dev)) for i in range(len(wid))]
        wvs = tc.cat(wvs, dim=0)
        File(f'{dn}-wvs').store(wvs)
def run_bkm_proposed(fn):
    """Run the proposed BERT+kmeans experiment over datasets `dns` and
    settings `sns`, logging per-iteration kmeans metrics (via proc_km)
    to 'res/<fn>'.
    """

    def do_train_cos(self):
        """Training step: cosine-similarity regression loss between the
        batch's pairwise model similarities and the supervision matrix.
        Bound as RunBert's train callback (hence the explicit `self`).
        """
        ids = self.batch[0]
        r = self.model(*self.batch)
        r = tens_unit(r, dim=1)      # row-normalise so r @ r.T is cosine sim
        r = r.matmul(r.t())
        s = self.sup()
        ids = ids.to(s.device)
        s = s[ids][:, ids]           # supervision restricted to this batch
        s = s.to(self.dev)
        loss = ((r - s)**2).mean()   # MSE between similarity matrices
        loss = loss * self.weight()
        return loss

    do_train = do_train_cos
    get_kmeans = KMeansCos
    init_rand(88888888)  # fixed seed for reproducibility
    rb = RunBert()
    rb.init_path()
    rb.init_dev(dev)
    file = File(f'res/{fn}')
    for dn in dns:
        if dn == 'so':
            continue  # 'so' dataset is excluded from this run
        y = File(f'data/{dn}-y').load()
        x = File(f'data/{dn}-x').load()
        rb.init_data((x, y))
        for sn in sns:
            if sn in ['tfidf', 'skipgram']:
                # Dataset/setting combinations known to be skipped.
                if dn == 'gs' and sn == 'tfidf':
                    continue
                if dn == 'bm' and sn == 'skipgram':
                    continue
                sim = File(f'data/{dn}-sim-{sn}').load()
                sim_size = sim.shape[0]
                rb.sim = sim
                del sim  # drop the local reference; rb keeps the tensor
                for pn in range(3):
                    if pn in [0, 1]:
                        continue  # only variant pn == 2 is active here
                    # NOTE(review): because of the `continue` above, this
                    # eigenvalue-filtering branch is dead code -- it looks
                    # like a manual toggle left from earlier experiments,
                    # so `ids` is always NA in this configuration.
                    if pn in [0, 1]:
                        eig = File(f'data/{dn}-sim-{sn}-eig').load()
                        eig = tens(eig, dt=DTR, dev=dev)
                        eigk = (1, 0.1)[pn]  # eigenvalue threshold per variant
                        ids = tensa(len(y), dt=DTI, dev=dev)
                        ids = ids[eig >= eigk].tolist()
                        print(f'len(ids)={len(ids)}, eigk={eigk}')
                        del eig, eigk
                    else:
                        ids = NA
                    model_name = f'model-{dn}-{sn}-{pn}'
                    rb.init_model()
                    iters = 100       # training epochs
                    iter_count = 150  # batches sampled per epoch

                    def get_sampler(dataset):
                        # Pair sampler restricted to the similarity matrix
                        # range, wrapped for the epoch/batch schedule.
                        sp_u = SamplerPair(dataset, 0, sim_size, ids=ids, nks=10)
                        sp_u = SamplerPairn(dataset, sp_u, iters, iter_count)
                        return sp_u

                    rb.init_sampler_eval()
                    rb.init_sampler(get_sampler)
                    rb.init_optimizer()
                    for i in range(iters):
                        # Train one epoch, then evaluate by clustering the
                        # current representations.
                        rb.train(do_train)
                        epoch_loss = rb.epoch_loss
                        rb.clear_train()
                        rb.init_repr()
                        rb.eval()
                        rb.clear_eval()
                        km = rb.kmeans(get_kmeans)
                        rb.clear_repr()
                        k, v = run_km(km)
                        del km
                        s = proc_km(k, v)
                        s['loss'] = round(epoch_loss, 4)
                        t = f'{dn}-{sn}-{pn}-{i+1}: {s}'
                        file.writea(t + '\n')  # append one log line per epoch
                        print(f'{TimeDate().valstr()}: {t}')
                    rb.clear_optimizer()
                    rb.clear_sampler()
                    rb.clear_sampler_eval()
                    rb.clear_model()
                    file.writea('\n')
                    print('=' * 64)
                rb.sim = NA  # release the similarity matrix for this setting
        rb.clear_data()
epoch += 1 if epoch % 100 == 0: pr() if epoch > epochs: break if tl < 0.01: nconv += 1 else: nconv = 0 if nconv >= 10: break pl() model = model.cpu() with tc.no_grad(): for k, v in model.named_parameters(): pr(k, v.squeeze().tolist()) pl() while TRUE: s = pris() try: tx = tens([[real(s)]], dt=DTR) ty = func(tx) r = model(tx) l = loss(r, ty).tolist() r = r.squeeze().tolist() ty = ty.squeeze().tolist() pr(y=r, Y=ty, loss=l) except Exception as e: pr(e) s = pris() lr, epochs = [real(i) for i in s.split()] break exit()