Example no. 1
0
def plot_decomposition(args):
    print(f'Reading vectors from `{args.vec_path}`...')
    embeddings, w2i, i2w = load_vectors(args.vec_path, gensim=args.gensim_format)

    matrix_path = os.path.join(args.matrix_dir, f'{args.name}')
    logX = load_matrix(matrix_path + '.logx.npz')
    fX = load_matrix(matrix_path + '.fx.npz')
    logX, fX = logX.todense(), fX.todense()

    plt.imshow(embeddings)
    plt.savefig(os.path.join('plots', 'emb.pdf'))
    plt.clf()

    plt.imshow(embeddings.T)
    plt.savefig(os.path.join('plots', 'emb.t.pdf'))
    plt.clf()

    plt.imshow(logX)
    plt.savefig(os.path.join('plots', 'logX.pdf'))
    plt.clf()

    plt.imshow(fX * logX)
    plt.savefig(os.path.join('plots', 'fX.logX.pdf'))
    plt.clf()

    plt.imshow(embeddings @ embeddings.T)
    plt.savefig(os.path.join('plots', 'logX_.pdf'))
    plt.clf()
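
Note: load_matrix itself is not shown in any of these snippets. For the .npz files loaded above and then densified with .todense(), a minimal sketch, assuming they were written with scipy.sparse.save_npz, could be:

from scipy import sparse

def load_matrix(path):
    # Assumption: the .npz file holds a SciPy sparse matrix, which callers
    # densify with .todense() when needed.
    return sparse.load_npz(path)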
Example no. 2
0
    def load_data(rank):
        # Loading co-occurrence data.
        if rank == 0: print('Loading co-occurrence matrices...')
        logX = load_matrix(matrix_path + '.logx.npz')
        if rank == 0: print('Loaded logX.')
        fX = load_matrix(matrix_path + '.fx.npz')
        if rank == 0: print('Loaded fX.')

        matrix_type = 'sparse' if sparse else 'dense'
        if rank == 0: print(f'Using {matrix_type} co-occurrence matrices during training.')
        if not sparse:
            logX = logX.todense()
            fX = fX.todense()
        return logX, fX
Example no. 3
0
def real_matrix():
    matrix_path = "./pickle/matrix/Abby_Watkins.matrix"
    if not os.path.exists(matrix_path):
        print(f"{matrix_path} ---- does not exist")
        sys.exit(1)

    matrix = load_matrix(matrix_path)
    return matrix
Example no. 4
0
 def load_graph(self, path):
     matrix = util.load_matrix(path)
     G = nx.Graph()
     length = len(matrix)
     for i in range(length):
         edges_list = [(i, j, matrix[i][j]) for j in range(i+1, length)]
         G.add_weighted_edges_from(edges_list)
     self.G = G
     return None
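
A small, self-contained illustration of the graph this loop builds, using a hypothetical in-memory weight matrix instead of util.load_matrix:

import networkx as nx

# Hypothetical 3x3 symmetric weight matrix standing in for util.load_matrix(path).
matrix = [[0.0, 0.5, 0.2],
          [0.5, 0.0, 0.9],
          [0.2, 0.9, 0.0]]

G = nx.Graph()
for i in range(len(matrix)):
    G.add_weighted_edges_from((i, j, matrix[i][j]) for j in range(i + 1, len(matrix)))

print(G.number_of_nodes(), G.number_of_edges())  # 3 3
print(G[0][1]["weight"])                         # 0.5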
Example no. 5
0
 def load_graph(self, path):
     matrix = util.load_matrix(path)
     G = nx.Graph()
     length = len(matrix)
     for i in range(length):
         edges_list = [(i, j, matrix[i][j]) for j in range(i + 1, length)]
         G.add_weighted_edges_from(edges_list)
     self.G = G
     return None
Example no. 6
0
def run(matrix_dir, svd_matrix_dir):
    if not os.path.exists(svd_matrix_dir):
        os.makedirs(svd_matrix_dir)

    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        matrix_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(matrix_path)
        svd_matrix = get_topicMatrix(np.array(matrix), len(matrix))
        svd_path = os.path.join(svd_matrix_dir, '%s.matrix' % name)
        util.dump_matrix(svd_matrix, svd_path)
    return None
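
get_topicMatrix is not defined in these snippets. A plausible sketch of an LSA-style reduction via truncated SVD, offered purely as an assumption about its intent:

import numpy as np

def get_topicMatrix(matrix, k):
    # Keep at most k singular triplets and project rows into the reduced space.
    U, S, _ = np.linalg.svd(matrix, full_matrices=False)
    k = min(k, len(S))
    return U[:, :k] * S[:k]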
Example no. 7
0
def main():
    grid = [[]]
    grid[0].append(util.load_matrix("input.txt"))

    for i in range(6):
        print('.')
        nxt = step2(grid)
        if h(nxt) == h(grid):
            break
        grid = nxt

    print(h(grid).count("#"))
Example no. 8
0
def run(matrix_dir, svd_matrix_dir):
    if not os.path.exists(svd_matrix_dir):
        os.makedirs(svd_matrix_dir)

    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        matrix_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(matrix_path)
        svd_matrix = get_topicMatrix(np.array(matrix), len(matrix))
        svd_path = os.path.join(svd_matrix_dir, '%s.matrix' % name)
        util.dump_matrix(svd_matrix, svd_path)
    return None
Example no. 9
0
def main():
    grid = []
    grid.append(util.load_matrix("input.txt"))

    #while True:
    for i in range(6):
        nxt = step2(grid)
        if h(nxt) == h(grid):
            break
        grid = nxt

    print(h(grid).count("#"))
Example no. 10
0
def run(matrix_dir, cosine_dir, similarity_method):
    util.makedir(cosine_dir)

    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        util.write('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)
        # sim_matrix = cosine(matrix)
        sim_matrix = compute_similarity(matrix, similarity_method)
        cosine_path = os.path.join(cosine_dir, '%s.matrix' % name)
        util.dump_matrix(sim_matrix, cosine_path)
    return None
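
compute_similarity is not shown; the commented-out cosine(matrix) call suggests row-wise cosine similarity, so a minimal sketch under that assumption is:

import numpy as np

def cosine_similarity_matrix(matrix):
    X = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # guard against all-zero rows
    X = X / norms
    return X @ X.T  # (n, n) matrix of pairwise cosine similarities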
Example no. 11
0
def run(matrix_dir, category_dir):
    if not os.path.exists(category_dir):
        os.makedirs(category_dir)

    clusterer = GAAClusterer()
    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        print('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)
        np_matrix = [np.array(row) for row in matrix]
        print(np_matrix)
        result = clusterer.cluster(np_matrix, False, "euc", "mean")
        category_path = os.path.join(category_dir, '%s.pickle' % name)
        with open(category_path, 'wb') as fp:
            pickle.dump(result, fp)
    return None
Example no. 12
0
def run(matrix_dir, category_dir):

    if not os.path.exists(category_dir):
        os.makedirs(category_dir)

    clusterer = AD_Cluster()
    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        print('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)

        size = len(matrix)
        result = [(9999,)] * size  # class 9999 marks "discard"; all samples start in the discard state
        realPTs, noisePTs, tmp, angles, real, noise = cutNoise(matrix)
        print("len of real", len(real))
        print("len of realPTs", len(realPTs))
        # -------------------- each noise point forms its own cluster --------------------
        cNum = 0  # number of class labels used so far
        for i in noise:
            result[i] = (cNum,)
            cNum += 1
        # ---------------------------------------------------------------------------------
        np_matrix = [np.array(row) for row in realPTs]
        print(np_matrix)
        result0 = clusterer.cluster(np_matrix, False, None, "euc", "mean")

        print("len of result0", len(result0))

        print("len of result", len(result))
        # ---------------- offset cluster labels by the number of labels already used ----------------
        for i, c in enumerate(result0):
            result[real[i]] = (c[0] + cNum,)

        category_path = os.path.join(category_dir, '%s.pickle' % name)
        with open(category_path, 'wb') as fp:
            pickle.dump(result, fp)

        # if count > 5:
        #      break
    return None
Example no. 13
0
def run(matrix_dir, category_dir):

    if not os.path.exists(category_dir):
        os.makedirs(category_dir)

    clusterer = AD_Cluster()
    count = 0
    for file_name in os.listdir(matrix_dir):
        name = file_name.split('.')[0]
        count += 1
        print('begin %s: %s' % (count, name))
        file_path = os.path.join(matrix_dir, file_name)
        matrix = util.load_matrix(file_path)

        size = len(matrix)
        result = [(9999, )] * size  # class 9999 marks "discard"; all samples start in the discard state
        realPTs, noisePTs, tmp, angles, real, noise = cutNoise(matrix)
        print("len of real", len(real))
        print("len of realPTs", len(realPTs))
        # -------------------- each noise point forms its own cluster --------------------
        cNum = 0  # number of class labels used so far
        for i in noise:
            result[i] = (cNum, )
            cNum += 1
        # -------------------------------------------------------
        np_matrix = [np.array(row) for row in realPTs]
        print(np_matrix)
        result0 = clusterer.cluster(np_matrix, False, None, "euc", "mean")

        print "len of result0", len(result0)

        print "len of result", len(result)
        # ---------------- 聚类结果标签加上原来已用的类标数 ----------
        for i, c in enumerate(result0):
            result[real[i]] = (c[0] + cNum, )

        category_path = os.path.join(category_dir, '%s.pickle' % name)
        with open(category_path, 'wb') as fp:
            pickle.dump(result, fp)

        # if count > 5:
        #      break
    return None
Example no. 14
0
def demo():
    from util import load_matrix, draw_line, draw_2D_noise
    points = load_matrix("./pickle/matrix/Alice_Gilbreath.matrix")
    # points = noise_2D_vector_rect(200, 3)
    # points.append((0,0))
    # points.append((0,0))
    vectors = [list(e) for e in points]

    realPTs, noisePTs, tmp, angles, real, noise = cutNoise(vectors)
    print("realPTs", len(realPTs), "real", len(real))
    print("noisePTs", len(noisePTs), "noise", len(noise))
    print(tmp)
    print(angles)
    draw_line(tmp)
    draw_line(angles)
    draw_2D_noise(realPTs, noisePTs)

    cluster1 = AD_Cluster()
    # vectors0 = [np.array(f) for f in points]
    vectors1 = [np.array(f) for f in realPTs]
Example no. 15
0
def main(args):
    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
    print('Device:', device)

    # Set seed for reproducibility.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    # Construct paths.
    vocab_path = os.path.join(args.vocab_dir, f'{args.name}.vocab')
    matrix_path = os.path.join(args.matrix_dir, f'{args.name}')
    model_path = os.path.join(args.model_dir, f'{args.name}.model.dict')
    log_path = os.path.join(args.log_dir, 'losses.csv')
    out_path = os.path.join(args.out_dir, args.name)

    # Load vocabulary.
    w2i, i2w = load_vocabulary(vocab_path)
    vocab_size = len(i2w)
    print(f'Loaded vocabulary of size {vocab_size}.')
    # Use sparse co-occurrence matrices for large vocabularies to save memory.
    sparse = vocab_size > 20000
    # Load co-occurrence data.
    print('Loading co-occurrence matrices...')
    logX = load_matrix(matrix_path + '.logx.npz')
    print('Loaded logX.')
    fX = load_matrix(matrix_path + '.fx.npz')
    print('Loaded fX.')
    matrix_type = 'sparse' if sparse else 'dense'
    print(f'Using {matrix_type} co-occurrence matrices during training.')
    if not sparse:
        logX = logX.todense()
        fX = fX.todense()

    # Construct model and optimizer.
    model = GloVe(vocab_size=vocab_size, emb_dim=args.emb_dim,
                  sparse=True).to(device)
    optimizer = torch.optim.SparseAdam(model.parameters(), lr=args.lr)

    print('Training...')
    losses = []
    logs = [('step', 'loss')]
    lr = args.lr
    epoch = 0
    prev_loss = np.inf
    t0 = time.time()
    try:
        for step in range(1, args.num_updates + 1):

            # Sample a random batch.
            idx = np.random.randint(0,
                                    high=vocab_size,
                                    size=(args.batch_size, ))
            indices = torch.LongTensor(idx).to(device)

            submat = np.ix_(idx, idx)  # used to select the submatrix
            if sparse:
                logx = logX[submat].todense()
                weights = fX[submat].todense()
            else:
                logx = logX[submat]
                weights = fX[submat]

            logx = torch.FloatTensor(logx).to(device)
            weights = torch.FloatTensor(weights).to(device)

            # Forward pass
            loss = model(indices, logx, weights)
            del indices, logx, weights  # free some memory

            # Update parameters
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Bookkeeping
            losses.append(loss.item())

            if step % args.print_every == 0:
                ls = losses[-args.print_every:]
                avg_loss = sum(ls) / args.print_every
                logs.append((step, avg_loss))
                print(
                    '| epoch {:4d} | step {:6d} | loss {:.4f} | pairs/sec {:.1f} | lr {:.1e}'
                    .format(
                        epoch, step, avg_loss, args.print_every *
                        args.batch_size / (time.time() - t0), lr))
                t0 = time.time()
                if args.use_schedule:
                    if avg_loss >= prev_loss:
                        lr /= 4.0
                        optimizer = torch.optim.SparseAdam(model.parameters(),
                                                           lr=lr)
                    prev_loss = avg_loss
            if step % args.save_every == 0:
                with open(log_path, 'w') as f:
                    writer = csv.writer(f)
                    writer.writerows(logs)
            k, _ = divmod(step * args.batch_size, vocab_size)
            if k > epoch:
                epoch = k
    except KeyboardInterrupt:
        print('-' * 89)
        print('Exiting from training early')

    # Get the learned embeddings from the model.
    embeddings = model.embedding.weight.data.cpu().numpy()
    if args.gensim_format:
        out_path += f'.{args.emb_dim}d.gensim.txt'
    else:
        out_path += f'.{args.emb_dim}d.txt'
    print(f'Writing vectors to `{out_path}`...')
    write_vectors(embeddings, out_path, args.emb_dim, i2w, gensim=args.gensim_format)
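
The GloVe module itself is not part of this snippet. The forward pass model(indices, logx, weights) presumably evaluates the standard GloVe weighted least-squares objective on the sampled submatrix, roughly as sketched below (the tensor names w, w_tilde, b, b_tilde are assumptions):

import torch

def glove_loss(w, w_tilde, b, b_tilde, logx, weights):
    # w, w_tilde: (batch, dim) target/context embeddings for the sampled indices.
    # b, b_tilde: (batch,) biases; logx, weights: (batch, batch) log counts and f(X).
    scores = w @ w_tilde.t() + b.unsqueeze(1) + b_tilde.unsqueeze(0)
    return (weights * (scores - logx) ** 2).sum()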
Example no. 16
0
def main(args):
    """create word vector
    :param file_path: path of corpus
    :param window_size: window size
    :param shift: num of samples in w2v skip-gram negative-sampling(sgns)
    :param dim: the size of wordvec WV = [vocab_size, dim]
    """
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
    logging.info(f"[INFO] args: {args}")

    logging.info("[INFO] Loading dictionary...")
    id_to_word, word_to_id = load_pickle(args.pickle_id2word)
    vocab_size = len(id_to_word)
    logging.debug(f"[DEBUG] vocab: {vocab_size} words")

    if args.cooccur_pretrained is not None:
        logging.info("[INFO] Loading pre-trained co-occur matrix...")
        C = load_matrix(args.cooccur_pretrained, len(id_to_word))
    else:
        logging.info("[INFO] Creating co-occur matrix...")
        C = create_co_matrix(args.file_path, word_to_id, vocab_size,
                             args.window_size)

        # threshold by min_count
        if args.threshold:
            C = threshold_cooccur(C, threshold=args.threshold)

        os.makedirs("model", exist_ok=True)
        c_name = "model/C_w-{}".format(args.window_size)
        with open(c_name, "w") as wp:
            for id, cooccur_each in enumerate(C):
                cooccur_nonzero = [
                    f"{id}:{c}" for id, c in enumerate(cooccur_each) if c > 0
                ]
                wp.write(f"{id}\t{' '.join(cooccur_nonzero)}\n")

    if args.sppmi_pretrained is not None:
        logging.info("[INFO] Loading pre-trained sppmi matrix...")
        M = load_matrix(args.sppmi_pretrained, len(id_to_word))
    else:
        logging.info("[INFO] Computing sppmi matrix...")
        # use smoothing or not in computing sppmi
        M = sppmi(C,
                  args.shift,
                  has_abs_dis=args.has_abs_dis,
                  has_cds=args.has_cds)
        m_name = "model/SPPMI_w-{}_s-{}".format(args.window_size, args.shift)
        with open(m_name, "w") as wp:
            for id, sppmi_each in enumerate(M):
                sppmi_nonzero = [
                    f"{id}:{m}" for id, m in enumerate(sppmi_each) if m > 0
                ]
                wp.write(f"{id}\t{' '.join(sppmi_nonzero)}\n")

    logging.info("[INFO] Calculating word vector...")
    try:
        from scipy.sparse.linalg import svds

        U, S, V = svds(coo_matrix(M), k=args.dim)
    except Exception:
        # Fall back to a dense SVD; np.linalg.svd cannot operate on sparse matrices.
        U, S, V = np.linalg.svd(coo_matrix(M).toarray())

    word_vec = np.dot(U, np.sqrt(np.diag(S)))
    wv_name = "model/WV_d-{}_w-{}_s-{}".format(args.dim, args.window_size,
                                               args.shift)
    np.save(wv_name, word_vec[:, :args.dim])

    return
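
For reference, the sppmi helper presumably computes the shifted positive PMI of Levy and Goldberg (2014), with the shift k given by args.shift. A minimal sketch, ignoring the has_abs_dis / has_cds smoothing options:

import numpy as np

def sppmi_matrix(C, shift):
    C = np.asarray(C, dtype=float)
    total = C.sum()
    row = C.sum(axis=1, keepdims=True)  # word counts #(w)
    col = C.sum(axis=0, keepdims=True)  # context counts #(c)
    with np.errstate(divide="ignore", invalid="ignore"):
        pmi = np.log(C * total / (row * col))
    pmi[~np.isfinite(pmi)] = 0.0
    return np.maximum(pmi - np.log(shift), 0.0)  # SPPMI = max(PMI - log k, 0)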