def plot_decomposition(args):
    print(f'Reading vectors from `{args.vec_path}`...')
    embeddings, w2i, i2w = load_vectors(args.vec_path, gensim=args.gensim_format)

    matrix_path = os.path.join(args.matrix_dir, f'{args.name}')
    logX = load_matrix(matrix_path + '.logx.npz')
    fX = load_matrix(matrix_path + '.fx.npz')
    logX, fX = logX.todense(), fX.todense()

    plt.imshow(embeddings)
    plt.savefig(os.path.join('plots', 'emb.pdf'))
    plt.clf()

    plt.imshow(embeddings.T)
    plt.savefig(os.path.join('plots', 'emb.t.pdf'))
    plt.clf()

    plt.imshow(logX)
    plt.savefig(os.path.join('plots', 'logX.pdf'))
    plt.clf()

    plt.imshow(fX * logX)
    plt.savefig(os.path.join('plots', 'fX.logX.pdf'))
    plt.clf()

    plt.imshow(embeddings @ embeddings.T)
    plt.savefig(os.path.join('plots', 'logX_.pdf'))
    plt.clf()
def load_data(rank):
    # Loading co-occurrence data.
    # `matrix_path` and `sparse` are expected to come from the enclosing scope.
    if rank == 0:
        print('Loading co-occurrence matrices...')
    logX = load_matrix(matrix_path + '.logx.npz')
    if rank == 0:
        print('Loaded logX.')
    fX = load_matrix(matrix_path + '.fx.npz')
    if rank == 0:
        print('Loaded fX.')
    type = 'sparse' if sparse else 'dense'
    if rank == 0:
        print(f'Using {type} co-occurrence matrices during training.')
    if not sparse:
        logX = logX.todense()
        fX = fX.todense()
    return logX, fX
def real_matrix():
    matrix_path = "./pickle/matrix/Abby_Watkins.matrix"
    if not os.path.exists(matrix_path):
        print("./pickle/matrix/Abby_Watkins.matrix ---- does not exist")
        sys.exit()  # requires `import sys`
    matrix = load_matrix(matrix_path)
    return matrix
def load_graph(self, path):
    matrix = util.load_matrix(path)
    G = nx.Graph()
    length = len(matrix)
    for i in range(length):
        edges_list = [(i, j, matrix[i][j]) for j in range(i + 1, length)]
        G.add_weighted_edges_from(edges_list)
    self.G = G
    return None
def run(matrix_dir, svd_matrix_dir):
    if not os.path.exists(svd_matrix_dir):
        os.makedirs(svd_matrix_dir)
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        matrix_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(matrix_path)
        svd_matrix = get_topicMatrix(np.array(matrix), len(matrix))
        svd_path = os.path.join(svd_matrix_dir, '%s.matrix' % name)
        util.dump_matrix(svd_matrix, svd_path)
    return None
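# A minimal, generic sketch of truncated-SVD dimensionality reduction, for
# illustration only: the actual `get_topicMatrix` called above is not shown
# here and may differ. `n_topics` is a hypothetical parameter name.
def truncated_svd_sketch(matrix, n_topics):
    import numpy as np
    U, S, Vt = np.linalg.svd(np.asarray(matrix, dtype=float), full_matrices=False)
    # Keep the top `n_topics` singular directions, scaled by their singular values.
    return U[:, :n_topics] * S[:n_topics]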
def main():
    grid = [[]]
    grid[0].append(util.load_matrix("input.txt"))
    for i in range(6):
        print('.')
        nxt = step2(grid)
        if h(nxt) == h(grid):
            break
        grid = nxt
    print(h(grid).count("#"))
def main():
    grid = []
    grid.append(util.load_matrix("input.txt"))
    # while True:
    for i in range(6):
        nxt = step2(grid)
        if h(nxt) == h(grid):
            break
        grid = nxt
    print(h(grid).count("#"))
def run(matrix_dir, cosine_dir, similarity_method):
    util.makedir(cosine_dir)
    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        util.write('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)
        # sim_matrix = cosine(matrix)
        sim_matrix = compute_similarity(matrix, similarity_method)
        cosine_path = os.path.join(cosine_dir, '%s.matrix' % name)
        util.dump_matrix(sim_matrix, cosine_path)
    return None
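# A minimal sketch of a pairwise cosine-similarity matrix, shown only to
# illustrate what `compute_similarity` might produce for a cosine method;
# the real helper is not shown above and may behave differently.
def cosine_similarity_sketch(matrix):
    import numpy as np
    X = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # avoid division by zero for all-zero rows
    X_normed = X / norms
    return X_normed @ X_normed.T  # entry (i, j) is the cosine of rows i and j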
def run(matrix_dir, category_dir):
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    clusterer = GAAClusterer()
    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        print('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)
        np_matrix = [np.array(row) for row in matrix]
        print(np_matrix)
        result = clusterer.cluster(np_matrix, False, "euc", "mean")
        category_path = os.path.join(category_dir, '%s.pickle' % name)
        with open(category_path, 'wb') as fp:
            pickle.dump(result, fp)
    return None
def run(matrix_dir, category_dir):
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)
    clusterer = AD_Cluster()
    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        print('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)
        size = len(matrix)
        result = [(9999,)] * size  # class 9999 means "discard"; initially every sample defaults to the discard state
        realPTs, noisePTs, tmp, angles, real, noise = cutNoise(matrix)
        print("len of real", len(real))
        print("len of realPTs", len(realPTs))
        # -------------------- each noise point becomes its own cluster -------------------------
        cNum = 0  # number of class labels already used
        for i in noise:
            result[i] = (cNum,)
            cNum += 1
        # -------------------------------------------------------
        np_matrix = [np.array(row) for row in realPTs]
        print(np_matrix)
        result0 = clusterer.cluster(np_matrix, False, None, "euc", "mean")
        print("len of result0", len(result0))
        print("len of result", len(result))
        # ---------------- offset the clustering labels by the number of labels already used ----------
        for i, c in enumerate(result0):
            result[real[i]] = (c[0] + cNum,)
        category_path = os.path.join(category_dir, '%s.pickle' % name)
        with open(category_path, 'wb') as fp:
            pickle.dump(result, fp)
        # if count > 5:
        #     break
    return None
def demo():
    from util import load_matrix, draw_line, draw_2D_noise
    points = load_matrix("./pickle/matrix/Alice_Gilbreath.matrix")
    # points = noise_2D_vector_rect(200, 3)
    # points.append((0, 0))
    # points.append((0, 0))
    vectors = [list(e) for e in points]
    realPTs, noisePTs, tmp, angles, real, noise = cutNoise(vectors)
    print("realPTs", len(realPTs), "real", len(real))
    print("noisePTs,", len(noisePTs), "noise", len(noise))
    print(tmp)
    print(angles)
    draw_line(tmp)
    draw_line(angles)
    draw_2D_noise(realPTs, noisePTs)
    cluster1 = AD_Cluster()
    # vectors0 = [np.array(f) for f in points]
    vectors1 = [np.array(f) for f in realPTs]
def main(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Device:', device)

    # Set seed for reproducibility.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Construct paths.
    vocab_path = os.path.join(args.vocab_dir, f'{args.name}.vocab')
    matrix_path = os.path.join(args.matrix_dir, f'{args.name}')
    model_path = os.path.join(args.model_dir, f'{args.name}.model.dict')
    log_path = os.path.join(args.log_dir, 'losses.csv')
    out_path = os.path.join(args.out_dir, args.name)

    # Load vocabulary.
    w2i, i2w = load_vocabulary(vocab_path)
    vocab_size = len(i2w)
    print(f'Loaded vocabulary of size {vocab_size}.')
    sparse = bool(vocab_size > 20000)

    # Load co-occurrence data.
    print('Loading co-occurrence matrices...')
    logX = load_matrix(matrix_path + '.logx.npz')
    print('Loaded logX.')
    fX = load_matrix(matrix_path + '.fx.npz')
    print('Loaded fX.')
    type = 'sparse' if sparse else 'dense'
    print(f'Using {type} co-occurrence matrices during training.')
    if not sparse:
        logX = logX.todense()
        fX = fX.todense()

    # Construct model and optimizer.
    model = GloVe(vocab_size=vocab_size, emb_dim=args.emb_dim,
                  sparse=True).to(device)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr)

    print('Training...')
    losses = []
    logs = [('step', 'loss')]
    lr = args.lr
    epoch = 0
    prev_loss = np.inf
    t0 = time.time()
    try:
        for step in range(1, args.num_updates + 1):
            # Sample a random batch.
            idx = np.random.randint(0, high=vocab_size, size=(args.batch_size,))
            indices = torch.LongTensor(idx).to(device)
            submat = np.ix_(idx, idx)  # used to select the submatrix
            if sparse:
                logx = logX[submat].todense()
                weights = fX[submat].todense()
            else:
                logx = logX[submat]
                weights = fX[submat]
            logx = torch.FloatTensor(logx).to(device)
            weights = torch.FloatTensor(weights).to(device)

            # Forward pass.
            loss = model(indices, logx, weights)
            del indices, logx, weights  # free some memory

            # Update parameters.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Bookkeeping.
            losses.append(loss.item())
            if step % args.print_every == 0:
                ls = losses[-args.print_every:]
                avg_loss = sum(ls) / args.print_every
                logs.append((step, avg_loss))
                print('| epoch {:4d} | step {:6d} | loss {:.4f} | pairs/sec {:.1f} | lr {:.1e}'.format(
                    epoch, step, avg_loss,
                    args.print_every * args.batch_size / (time.time() - t0), lr))
                t0 = time.time()
                if args.use_schedule:
                    if avg_loss >= prev_loss:
                        lr /= 4.0
                        optimizer = torch.optim.SparseAdam(model.parameters(), lr=lr)
                    prev_loss = avg_loss
            if step % args.save_every == 0:
                with open(log_path, 'w') as f:
                    writer = csv.writer(f)
                    writer.writerows(logs)
            k, _ = divmod(step * args.batch_size, vocab_size)
            if k > epoch:
                epoch = k
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Get the learned embeddings from the model.
    embeddings = model.embedding.weight.data.cpu().numpy()

    if args.gensim_format:
        out_path += f'.{args.emb_dim}d.gensim.txt'
    else:
        out_path += f'.{args.emb_dim}d.txt'
    print(f'Writing vectors to `{out_path}`...')
    write_vectors(embeddings, out_path, args.emb_dim, i2w, gensim=True)
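# A minimal sketch of the `np.ix_` trick used in the training loop above to
# slice a batch-by-batch submatrix out of the co-occurrence matrix; the toy
# values here are made up purely for illustration.
import numpy as np

logX_toy = np.arange(16).reshape(4, 4)
idx = np.array([0, 2])
submat = np.ix_(idx, idx)   # open mesh of the selected row and column indices
print(logX_toy[submat])     # rows 0 and 2 crossed with columns 0 and 2:
                            # [[ 0  2]
                            #  [ 8 10]]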
def main(args):
    """create word vector

    :param file_path: path of corpus
    :param window_size: window size
    :param shift: num of samples in w2v skip-gram negative-sampling (sgns)
    :param dim: the size of wordvec WV = [vocab_size, dim]
    """
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
    logging.info(f"[INFO] args: {args}")

    logging.info("[INFO] Loading dictionary...")
    id_to_word, word_to_id = load_pickle(args.pickle_id2word)
    vocab_size = len(id_to_word)
    logging.debug(f"[DEBUG] vocab: {vocab_size} words")

    if args.cooccur_pretrained is not None:
        logging.info("[INFO] Loading pre-trained co-occur matrix...")
        C = load_matrix(args.cooccur_pretrained, len(id_to_word))
    else:
        logging.info("[INFO] Creating co-occur matrix...")
        C = create_co_matrix(args.file_path, word_to_id, vocab_size, args.window_size)
        # threshold by min_count
        if args.threshold:
            C = threshold_cooccur(C, threshold=args.threshold)
        os.makedirs("model", exist_ok=True)
        c_name = "model/C_w-{}".format(args.window_size)
        with open(c_name, "w") as wp:
            for id, cooccur_each in enumerate(C):
                cooccur_nonzero = [
                    f"{id}:{c}" for id, c in enumerate(cooccur_each) if c > 0
                ]
                wp.write(f"{id}\t{' '.join(cooccur_nonzero)}\n")

    if args.sppmi_pretrained is not None:
        logging.info("[INFO] Loading pre-trained sppmi matrix...")
        M = load_matrix(args.sppmi_pretrained, len(id_to_word))
    else:
        logging.info("[INFO] Computing sppmi matrix...")
        # use smoothing or not in computing sppmi
        M = sppmi(C, args.shift, has_abs_dis=args.has_abs_dis, has_cds=args.has_cds)
        m_name = "model/SPPMI_w-{}_s-{}".format(args.window_size, args.shift)
        with open(m_name, "w") as wp:
            for id, sppmi_each in enumerate(M):
                sppmi_nonzero = [
                    f"{id}:{m}" for id, m in enumerate(sppmi_each) if m > 0
                ]
                wp.write(f"{id}\t{' '.join(sppmi_nonzero)}\n")

    logging.info("[INFO] Calculating word vector...")
    try:
        from scipy.sparse.linalg import svds
        U, S, V = svds(coo_matrix(M), k=args.dim)
    except:
        U, S, V = np.linalg.svd(coo_matrix(M))
    word_vec = np.dot(U, np.sqrt(np.diag(S)))

    wv_name = "model/WV_d-{}_w-{}_s-{}".format(args.dim, args.window_size, args.shift)
    np.save(wv_name, word_vec[:, :args.dim])
    return
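# A minimal, generic SPPMI (shifted positive PMI) computation, shown only to
# illustrate the standard formula max(0, PMI - log k). The `sppmi` helper used
# above (with its has_abs_dis / has_cds options) is not shown here and may
# differ in its smoothing details.
def sppmi_sketch(C, shift):
    import numpy as np
    C = np.asarray(C, dtype=float)
    total = C.sum()
    row = C.sum(axis=1, keepdims=True)
    col = C.sum(axis=0, keepdims=True)
    with np.errstate(divide='ignore', invalid='ignore'):
        pmi = np.log(C * total / (row * col))
    pmi[~np.isfinite(pmi)] = 0.0           # zero co-occurrence -> PMI treated as 0
    return np.maximum(pmi - np.log(shift), 0.0)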