Example 1
def main(args):
    """ Train GAE """
    print("Using {} dataset".format(args.dataset_str))
    # Load data
    np.random.seed(1)
    adj, features = load_data(args.dataset_str)
    N, D = features.shape

    # Store original adjacency matrix (without diagonal entries)
    adj_orig = adj
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)

    # Some preprocessing
    adj_train_norm = preprocess_graph(adj_train)
    adj_train_norm = Variable(make_sparse(adj_train_norm))
    adj_train_labels = Variable(
        torch.FloatTensor(adj_train + sp.eye(adj_train.shape[0]).todense()))
    features = Variable(make_sparse(features))

    n_edges = adj_train_labels.sum()

    data = {
        'adj_norm': adj_train_norm,
        'adj_labels': adj_train_labels,
        'features': features,
    }

    gae = GAE(data,
              n_hidden=32,
              n_latent=16,
              dropout=args.dropout,
              subsampling=args.subsampling)

    optimizer = Adam({"lr": args.lr, "betas": (0.95, 0.999)})

    svi = SVI(gae.model, gae.guide, optimizer, loss="ELBO")

    # Results
    results = defaultdict(list)

    # Full batch training loop
    for epoch in range(args.num_epochs):
        # initialize loss accumulator
        epoch_loss = 0.
        # do ELBO gradient and accumulate loss
        epoch_loss += svi.step()
        # report training diagnostics
        if args.subsampling:
            normalized_loss = epoch_loss / float(2 * n_edges)
        else:
            normalized_loss = epoch_loss / (2 * N * N)

        results['train_elbo'].append(normalized_loss)

        # Validation metrics during training
        emb = gae.get_embeddings()
        accuracy, roc_curr, ap_curr = eval_gae(val_edges, val_edges_false,
                                               emb, adj_orig)

        results['accuracy_train'].append(accuracy)
        results['roc_train'].append(roc_curr)
        results['ap_train'].append(ap_curr)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(normalized_loss), "train_acc=",
              "{:.5f}".format(accuracy), "val_roc=", "{:.5f}".format(roc_curr),
              "val_ap=", "{:.5f}".format(ap_curr))

        # Test loss
        if epoch % args.test_freq == 0:
            emb = gae.get_embeddings()
            accuracy, roc_score, ap_score = eval_gae(test_edges,
                                                     test_edges_false, emb,
                                                     adj_orig)
            results['accuracy_test'].append(accuracy)
            results['roc_test'].append(roc_score)
            results['ap_test'].append(ap_score)

    print("Optimization Finished!")

    # Test loss
    emb = gae.get_embeddings()
    accuracy, roc_score, ap_score = eval_gae(test_edges, test_edges_false, emb,
                                             adj_orig)
    print('Test Accuracy: ' + str(accuracy))
    print('Test ROC score: ' + str(roc_score))
    print('Test AP score: ' + str(ap_score))

    # Plot
    plot_results(results,
                 args.test_freq,
                 path=args.dataset_str + "_results.png")
Example 2
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

else:
    # Load data
    adj, features = load_data(dataset_str)
    print("Loaded dataset")
    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)

adj = adj_train

if FLAGS.features == 0:
    features = sp.identity(features.shape[0])  # featureless

# Some preprocessing
adj_norm = preprocess_graph(adj)

# Define placeholders
placeholders = {
    'features': tf.sparse_placeholder(tf.float32),
    'adj': tf.sparse_placeholder(tf.float32),
    'adj_orig': tf.sparse_placeholder(tf.float32),
    'dropout': tf.placeholder_with_default(0., shape=())
}
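
preprocess_graph is used throughout these examples to normalize the training adjacency before it reaches the encoder. A sketch under the usual GAE assumption of symmetric normalization, D^{-1/2} (A + I) D^{-1/2} with D the degree matrix of A + I; the TensorFlow examples additionally pass the result through sparse_to_tuple.

import numpy as np
import scipy.sparse as sp

def preprocess_graph_sketch(adj):
    """Hypothetical preprocess_graph: symmetrically normalize A + I."""
    adj_ = sp.coo_matrix(adj) + sp.eye(adj.shape[0])   # add self-loops
    rowsum = np.array(adj_.sum(1)).flatten()
    d_inv_sqrt = sp.diags(np.power(rowsum, -0.5))      # D^{-1/2}
    return (d_inv_sqrt @ adj_ @ d_inv_sqrt).tocoo()    # D^{-1/2} (A + I) D^{-1/2}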
Example 3
    if dataset_str == 'synthetic':
        adj, features = get_synthetic_data(p=p, attrNoise=attrNoise, m=m)
    else:
        adj, features = load_data(dataset_str)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj  # sparse matrix
    # adj_orig.diagonal()[np.newaxis, :] is a row vector
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]),
        shape=adj_orig.shape)  # zero out the diagonal elements

    # A sparse matrix should not store explicit zeros, so call eliminate_zeros() after an update.
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj, test_percent=10., val_percent=5.)
    adj = adj_train  # The adjacency matrix with all validation and test entries masked out.
    #print(adj_train.shape)
    #import pdb;pdb.set_trace()

    if FLAGS.features == 0:
        features = sp.identity(
            features.shape[0])  # featureless. sparse coo_matrix.

    # Some preprocessing
    #adj_norm = preprocess_graph(adj)

    attn_adj_norm = adj + sp.eye(adj.shape[0])
    attn_adj_norm = sparse_to_tuple(attn_adj_norm)  # a tuple

    adj_norm = preprocess_graph(adj)
Example 4
def format_data(data_name):
    # Load data

    adj, features, y_test, tx, ty, test_maks, true_labels = load_data(
        data_name)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)
    items = [
        adj, num_features, num_nodes, features_nonzero, pos_weight, norm,
        adj_norm, adj_label, features, true_labels, train_edges, val_edges,
        val_edges_false, test_edges, test_edges_false, adj_orig
    ]
    feas = {}
    for item in items:
        # item_name = [ k for k,v in locals().iteritems() if v == item][0]
        feas[retrieve_name(item)] = item

    return feas
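
The pos_weight and norm values computed in format_data are typically consumed by a weighted sigmoid cross-entropy reconstruction loss; the loss itself lives outside these excerpts, so the sketch below only illustrates the assumed usage.

import tensorflow as tf

def weighted_recon_loss(logits, labels, pos_weight, norm):
    """Assumed GAE-style reconstruction loss using pos_weight/norm from above."""
    # pos_weight up-weights the rare positive (edge) entries of the adjacency;
    # norm rescales the mean so the loss magnitude is comparable across graphs.
    per_entry = tf.nn.weighted_cross_entropy_with_logits(
        labels=labels, logits=logits, pos_weight=pos_weight)
    return norm * tf.reduce_mean(per_entry)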
Example 5
def main(args):
    
    dataset = args.dataset
    emb_output_dir = args.output
    epochs = args.epochs
    agg = args.agg
    p = args.p
    tr = args.tr
    lam = args.lam
    lose_func = args.loss

    # Preprocess dataset
    adj, views_features = load_data(dataset, num_views=3)
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()
    # Calculate pairwise similarity.
    views_sim_matrix = {}
    views_feature_matrix = {}

    for view in list(views_features.keys()):
        feature_matrix = csc_matrix.todense(views_features[view])
        views_feature_matrix.update({view:feature_matrix})
 
    kernal = "rbf"
    if lose_func == 'all':
        attr_sim = cal_attr_sim(views_feature_matrix, dataset)
    else:
        attr_sim = 0

    # Split edges into train, validation and test sets, and
    # remove validation/test edges from the training adjacency matrix.
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(dataset, adj)
    
    print("Masking edges Done!")
    adj = adj_train
    nx_G = nx.from_numpy_array(adj.toarray())
    num_nodes = adj.shape[0]
    adj_norm = preprocess_graph(adj)

    views_features_num = {}
    views_features_nonzero = {}
    for view in list(views_features.keys()):
        views_features[view] = sparse_to_tuple(views_features[view].tocoo())
        views_features_num.update({view:views_features[view][2][1]})
        views_features_nonzero.update({view:views_features[view][1].shape[0]})
    
    # Build model
    MagCAE = {}
    for view in list(views_features.keys()):
        x,y = views_features[view][2][0], views_features[view][2][1]
        model = GAE(y, views_features_nonzero[view], adj_norm, math.ceil(2*p*y), math.ceil(p*y))
        MagCAE.update({view:model})

    # Loss function and optimizer.
    # pos_weight up-weights the rare positive entries; norm rescales the total loss.
    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)
    optimizer = tf.keras.optimizers.Adam()

    adj_targ = adj_train + sp.eye(adj_train.shape[0])
    adj_targ = sparse_to_tuple(adj_targ)

    indices = np.array(adj_targ[0])
    values = np.array(adj_targ[1])
    dense_shape = np.array(adj_targ[2])
    sparse_targ = tf.SparseTensor(indices=indices,
                                  values=values,
                                  dense_shape=dense_shape)
    sparse_targ = tf.cast(sparse_targ, dtype=tf.float32)

    adj_targ = tf.sparse.to_dense(sparse_targ)
    adj_targ = tf.reshape(adj_targ,[-1])
    # Train and Evaluate Model
    # Training Loop:
    # In each epoch: views - > view_embedding -> aggregate embedding -> total loss ->  update gradients
    decoder = Decoder(100)

    for epoch in range(epochs):
        loss = 0
        start = time.time()

        with tf.GradientTape() as tape:
            ag_embedding = {}

            for VAE in list(MagCAE.keys()):
                v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
                ag_embedding.update({VAE:v_embedding})

            # aggregate embeddings
            embedding, aggregator = aggregate_embeddings(ag_embedding, agg)
            # reconstruct a_hat
            a_hat = decoder(embedding)
            loss += loss_function(a_hat, adj_targ, pos_weight, norm, attr_sim, embedding, num_nodes, lam, lose_func)

        if agg == "weighted_concat":
            variables = MagCAE['view1'].trainable_variables + MagCAE['view2'].trainable_variables + MagCAE['view3'].trainable_variables + aggregator.trainable_variables

        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

        # Evaluate on validate set
        embedding = np.array(embedding)
        roc_cur, ap_cur, _, _ = evaluate(val_edges, val_edges_false, adj_orig, embedding)

        print("Epoch {}: Val_Roc {:.4f}, Val_AP {:.4f}, Time Consumed {:.2f} sec\n".format(epoch+1, roc_cur, ap_cur, time.time()-start))

    print("Training Finished!")
    
    # Evaluation Result on test Edges
    test_embedding = {}
    for VAE in list(MagCAE.keys()):
        v_embedding, a_hat = MagCAE[VAE](views_features[VAE])
        test_embedding.update({VAE:v_embedding})

    # aggregate embeddings
    embedding, aggregator = aggregate_embeddings(test_embedding, agg)
    embedding = np.array(embedding) # embedding is a tensor, convert to np array.

    # reconstruct a_hat
    test_roc, test_ap, fpr, tpr = evaluate(test_edges, test_edges_false, adj_orig, embedding)
    print("MagCAE test result on {}".format(dataset))
    print("Test Roc: {}, Test AP: {}, P: {}, Training Ratio: {}, Lambda: {}.".format(test_roc, test_ap, p, tr, lam))
Example 6
x = sp.lil_matrix(features)
features_tuple = sparse_to_tuple(x)
features_shape = features_tuple[2]
# Get graph attributes (to feed into model)
num_nodes = adj.shape[0]  # number of nodes in adjacency matrix
num_features = features_shape[1]  # number of features (columns of the features matrix)
features_nonzero = features_tuple[1].shape[0]  # number of non-zero entries (length of the values list)
# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix(
    (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()
np.random.seed(0)  # IMPORTANT: guarantees consistent train/test splits
adj_train, train_edges, train_edges_false, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
    adj, test_frac=.3, val_frac=.1)

# Normalize adjacency matrix
adj_norm = preprocess_graph(adj_train)

# Add in diagonals
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = sparse_to_tuple(adj_label)
# Inspect train/test split
print("Total nodes:", adj.shape[0])
print("Total edges:", int(
    adj.nnz / 2))  # adj is symmetric, so nnz (num non-zero) = 2*num_edges
print("Training edges (positive):", len(train_edges))
print("Training edges (negative):", len(train_edges_false))
print("Validation edges (positive):", len(val_edges))
print("Validation edges (negative):", len(val_edges_false))
Example 7
def main(args):
    """ Train GAE """

    # Compute the device upon which to run
    device = torch.device("cuda" if args.use_cuda else "cpu")

    print("Using {} dataset".format(args.dataset_str))
    # Load data
    np.random.seed(1)
    adj, features = load_data(args.dataset_str)
    N, D = features.shape

    # Store original adjacency matrix (without diagonal entries)
    adj_orig = adj
    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)

    # Some preprocessing
    adj_train_norm = preprocess_graph(adj_train)
    adj_train_norm = make_sparse(adj_train_norm)
    adj_train_labels = torch.FloatTensor(adj_train +
                                         sp.eye(adj_train.shape[0]).todense())
    features = make_sparse(features)

    n_edges = adj_train_labels.sum()

    data = {
        'adj_norm': adj_train_norm,
        'adj_labels': adj_train_labels,
        'features': features,
    }

    gae = GAE(data, n_hidden=32, n_latent=16, dropout=args.dropout)

    # Send the model and data to the available device
    gae.to(device)
    data['adj_norm'] = data['adj_norm'].to(device)
    data['adj_labels'] = data['adj_labels'].to(device)
    data['features'] = data['features'].to(device)

    optimizer = optim.Adam(gae.parameters(),
                           lr=args.lr,
                           betas=(0.95, 0.999),
                           weight_decay=args.weight_decay)

    # Results
    results = defaultdict(list)

    # Full batch training loop
    for epoch in range(args.num_epochs):

        t = time.time()
        gae.train()
        optimizer.zero_grad()

        # forward pass
        output = gae(data['features'], data['adj_norm'])

        # Compute the loss
        logits = output
        targets = data['adj_labels']
        loss = gae.norm * F.binary_cross_entropy_with_logits(
            logits, targets, pos_weight=gae.pos_weight)

        loss.backward()
        optimizer.step()

        results['train_elbo'].append(loss.item())

        gae.eval()
        emb = gae.get_embeddings(data['features'], data['adj_norm'])
        accuracy, roc_curr, ap_curr = eval_gae(val_edges, val_edges_false,
                                               emb, adj_orig)
        results['accuracy_train'].append(accuracy)
        results['roc_train'].append(roc_curr)
        results['ap_train'].append(ap_curr)

        print("Epoch:", '%04d' % (epoch + 1), "train_loss=",
              "{:.5f}".format(loss.item()), "train_acc=",
              "{:.5f}".format(accuracy), "val_roc=", "{:.5f}".format(roc_curr),
              "val_ap=", "{:.5f}".format(ap_curr))

        # Test loss
        if epoch % args.test_freq == 0:
            with torch.no_grad():
                gae.eval()
                emb = gae.get_embeddings(data['features'], data['adj_norm'])
                accuracy, roc_score, ap_score = eval_gae(
                    test_edges, test_edges_false, emb, adj_orig)
                results['accuracy_test'].append(accuracy)
                results['roc_test'].append(roc_score)
                results['ap_test'].append(ap_score)
            gae.train()

    print("Optimization Finished!")

    with torch.no_grad():
        # Test loss
        gae.eval()
        emb = gae.get_embeddings(data['features'], data['adj_norm'])
        accuracy, roc_score, ap_score = eval_gae(test_edges, test_edges_false,
                                                 emb, adj_orig)
        print('Test Accuracy: ' + str(accuracy))
        print('Test ROC score: ' + str(roc_score))
        print('Test AP score: ' + str(ap_score))

    # Plot
    plot_results(results,
                 args.test_freq,
                 path=args.dataset_str + "_GAE_results.png")
Example 8
def format_data(data_name):
    # Load data

    adj, features, true_labels = load_data(data_name)

    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix(
        (adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(
        adj)
    adj = adj_train

    if FLAGS.features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float(
        (adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + 2 * sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)
    feas = {}
    feas['adj'] = adj
    feas['num_features'] = num_features
    feas['num_nodes'] = num_nodes
    feas['features_nonzero'] = features_nonzero
    feas['pos_weight'] = pos_weight
    feas['norm'] = norm
    feas['adj_norm'] = adj_norm
    feas['adj_label'] = adj_label
    feas['features'] = features
    feas['true_labels'] = true_labels
    feas['train_edges'] = train_edges
    feas['val_edges'] = val_edges
    feas['val_edges_false'] = val_edges_false
    feas['test_edges'] = test_edges
    feas['test_edges_false'] = test_edges_false
    feas['adj_orig'] = adj_orig

    return feas
Example 9
def format_data_ui(data_name, has_features=1):
    # Load data

    fpath_dir = '../data/useritem/%s/' % data_name
    fpath_input = '%sinput.pkl' % fpath_dir
    with open(fpath_input, 'rb') as f:
        # item_features here is not the features object this function returns
        (n_users, n_items, item_features, train, valid, test) = pkl.load(f)
    ui_graph = defaultdict(list)
    ii_graph = defaultdict(set)
    ii_graph_list = defaultdict(list)  # dict()
    for edge, value in train.items():
        u, i = edge
        ui_graph[u].append(i)
    #
    edge_dict = defaultdict(int)
    tmp_u_number = len(ui_graph)
    for index, (u, ilist) in enumerate(ui_graph.items()):

        if index % 500 == 0:
            print('user number: %d/%d' % (index, tmp_u_number))
        for i in ilist:
            for j in ilist:
                # ii_graph[i].add(j)
                if i != j:
                    edge_dict[(i, j)] += 1
        if len(edge_dict) % 5000 == 0:
            print('len(edge_dict):%d' % len(edge_dict))

    print('len(edge_dict):%d' % len(edge_dict))
    edge_visit_thresh = 2

    for edge, visit_num in edge_dict.items():
        i1, i2 = edge
        if visit_num >= edge_visit_thresh:
            ii_graph_list[i1].append(i2)  # = list(iset)
    print('%s:get ii mat' % (datetime.datetime.now().isoformat()))
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(ii_graph_list))
    print('adj shape:', adj.get_shape())

    # features: lil_matrix
    features = item_features.tolil()

    # true_labels (ground-truth labels) are not used here or by ARGA
    true_labels = None

    # Transformation done; the rest follows the original preprocessing procedure
    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train
    print('%s:mask test edges over' % (datetime.datetime.now().isoformat()))
    if has_features == 0:
        features = sp.identity(features.shape[0])  # featureless

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)
    items = [adj, num_features, num_nodes, features_nonzero, pos_weight, norm, adj_norm, adj_label, features,
             true_labels, train_edges, val_edges, val_edges_false, test_edges, test_edges_false, adj_orig]
    feas = {}
    for item in items:
        feas[retrieve_name(item)] = item

    return feas
Example 10
def format_data_ui_concat(data_name, has_features=1):
    '''
    Concatenate user and item indices into a single node set and build the
    combined adjacency matrix together with its index map.
    '''
    # Load data

    fpath_dir = '../data/useritem/%s/' % data_name
    fpath_input = '%sinput.pkl' % fpath_dir
    with open(fpath_input, 'rb') as f:
        (n_users, n_items, item_features, train, valid, test) = pkl.load(
            f)  # here features is not the returned features
    ui_graph = defaultdict(list)
    ii_graph = defaultdict(set)
    ii_graph_list = defaultdict(list)  # dict()

    user_set = set()
    item_set = set()
    tag_set = set()

    for edge, value in train.items():
        u, i = edge
        user_set.add(u)
        item_set.add(i)

    for edge, value in valid.items():
        u, i = edge
        user_set.add(u)
        item_set.add(i)

    for edge, value in test.items():
        u, i = edge
        user_set.add(u)
        item_set.add(i)

    # check if n_users is from [0, n_users-1]
    print(n_users)
    print('user:len, min, max:', len(user_set), min(user_set), max(user_set))
    print(n_items)
    print('item:len, min, max:', len(item_set), min(item_set), max(item_set))

    max_user_index_plus1 = max(user_set) + 1
    user_plus_item_num = (max_user_index_plus1 + max(item_set)) + 1
    new_ui_edge_dict = defaultdict(list)

    for edge, value in train.items():
        u, i = edge
        new_ui_edge_dict[u].append(i + max_user_index_plus1)
        new_ui_edge_dict[i + max_user_index_plus1].append(u)

    print('%s:get ii mat' % (datetime.datetime.now().isoformat()))
    G_ui = nx.from_dict_of_lists(new_ui_edge_dict)
    G_ui_nodes_list = list(G_ui.nodes())
    adj = nx.adjacency_matrix(G_ui)
    ui_to_ui_index_dict = dict()  # include u and i
    for i in range(len(G_ui_nodes_list)):
        ui_to_ui_index_dict[G_ui_nodes_list[i]] = i

    print('adj shape:', adj.get_shape())

    tag_set = set()
    for (item, tag), value in item_features.items():
        tag_set.add(tag)
    max_tag_num = max(tag_set) + 1
    item_features_mapped = sp.dok_matrix((user_plus_item_num, max_tag_num), dtype=np.int64)

    unused_item_cnt = 0  # items that never appear in the training user-item edges
    for (item, tag), value in item_features.items():
        item_mapped = item + max_user_index_plus1
        if item_mapped not in ui_to_ui_index_dict:
            # item not present in the combined graph
            unused_item_cnt += 1
        else:
            item_features_mapped[ui_to_ui_index_dict[item_mapped], tag] = value

    print('unused_item_cnt: %d' % unused_item_cnt)

    features = item_features_mapped.tolil()  # item features, re-indexed to the combined node ids
    # true_labels (ground-truth labels) are not used here or by ARGA
    true_labels = None

    # Transformation done; the rest follows the original preprocessing procedure
    # Store original adjacency matrix (without diagonal entries) for later
    adj_orig = adj
    adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]),
                                        shape=adj_orig.shape)  # remove the adjacency matrix's diagonal (offset 0)
    adj_orig.eliminate_zeros()

    adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
    adj = adj_train
    print('%s:mask test edges over' % (datetime.datetime.now().isoformat()))
    # if FLAGS.features == 0:
    if has_features == 0:
        features = sp.identity(features.shape[0])  # featureless: identity features (only the diagonal is 1)

    # Some preprocessing
    adj_norm = preprocess_graph(adj)

    num_nodes = adj.shape[0]

    features = sparse_to_tuple(features.tocoo())
    num_features = features[2][1]
    features_nonzero = features[1].shape[0]

    pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
    norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

    adj_label = adj_train + sp.eye(adj_train.shape[0])
    adj_label = sparse_to_tuple(adj_label)
    items = [adj, num_features, num_nodes, features_nonzero, pos_weight, norm, adj_norm, adj_label, features,
             true_labels, train_edges, val_edges, val_edges_false, test_edges, test_edges_false, adj_orig,
             ui_to_ui_index_dict, max_user_index_plus1]  # add ui
    feas = {}
    for item in items:
        feas[retrieve_name(item)] = item

    return feas
Example 11
    def val_test_data_gen(self):
        """Loop over the graph time sequence and compute a random set as a test set"""

        self.val_edges_list = []
        self.val_edges_false_list = []
        self.test_edges_list = []
        self.test_edges_false_list = []

        # new edges
        self.all_pos_edge_set = []
        self.new_edges_list = []
        self.new_edges_false_list = []

        # Loop over the sequence length:
        # if seq_len is 30, i ranges over 0..29, i.e. every graph in the time series
        for i in range(self.args.seq_len):

            val_test_graph, _ = load_adj_graph(f'{self.data_loc}_t{i}.npz')
            val_test_graph_adj, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(val_test_graph, test_percent=30., val_percent=20.)
            self.val_edges_list.append(val_edges)
            self.val_edges_false_list.append(val_edges_false)
            self.test_edges_list.append(test_edges)
            self.test_edges_false_list.append(test_edges_false)
            pos_edges = np.concatenate((val_edges, test_edges, train_edges)).tolist()
            self.all_pos_edge_set.append(set(map(tuple, pos_edges)))

        # Loop over the sequence again to get the new edges at each time point
        for i in range(self.args.seq_len):
            if i == 0:  # nothing to compare against at the first time step
                self.new_edges_list.append(None)
                self.new_edges_false_list.append(None)
                continue

            # new edges since the last time step
            new_edges = np.array(list(self.all_pos_edge_set[i] - self.all_pos_edge_set[i-1]))
            if len(new_edges) == 0: # if the edge list is empty
                self.new_edges_list.append(None)
                self.new_edges_false_list.append(None)
            else:
                num_edges = len(new_edges)
                self.new_edges_list.append(new_edges)
                self.new_edges_false_list.append(self.test_edges_false_list[i][:num_edges])

        print("Validation and Test edges captured from last graph in the sequence")

        # Set the number of vertices in the graph
        self.num_nodes = val_test_graph.shape[0]
Example 12
def link_pred_emb(p=8,
                  q=0.5,
                  win_size=10,
                  num_walks=10,
                  walk_length=20,
                  dimension=55,
                  iter=1,
                  rocfile="Plots/roctest.png",
                  result_file_path="results/parameters.txt") -> None:
    """The main function. Link prediction is done here."""

    # Load pickled (adj, feat) tuple
    with open(os.path.join(NETWORK_DIR, PICKLE_FILE), "rb") as file:
        adj, features = pickle.load(file)
    with open(os.path.join(NETWORK_DIR, ID_MAP_FILE), "rb") as file:
        id_map = pickle.load(file)

    g = nx.Graph(adj)  # Recreate graph using node indices (0 to num_nodes-1)
    # Draw the network
    # nx.draw_networkx(g, with_labels=False, node_size=50, node_color="r")
    # plt.show()

    # Preprocessing (train/test split)
    np.random.seed(0)  # make sure train-test split is consistent
    adj_sparse = nx.to_scipy_sparse_matrix(g)
    # Perform train-test split
    (
        adj_train,
        train_edges,
        train_edges_false,
        val_edges,
        val_edges_false,
        test_edges,
        test_edges_false,
    ) = mask_test_edges(adj_sparse, test_frac=0.3, val_frac=0.1)

    # new graph object with only non-hidden edges
    g_train = nx.from_scipy_sparse_matrix(adj_train)

    # Inspect train/test split
    print("Total nodes:", adj_sparse.shape[0])

    # adj is symmetric, so nnz (num non-zero) = 2 * num_edges
    print("Total edges:", int(adj_sparse.nnz / 2))
    print("Training edges (positive):", len(train_edges))
    print("Training edges (negative):", len(train_edges_false))
    print("Validation edges (positive):", len(val_edges))
    print("Validation edges (negative):", len(val_edges_false))
    print("Test edges (positive):", len(test_edges))
    print("Test edges (negative):", len(test_edges_false))

    # Train node2vec (Learn Node Embeddings)

    # node2vec settings
    # NOTE: When p = q = 1, this is equivalent to DeepWalk

    # P = 5   # Return hyperparameter
    # Q = 0.65  # In-out hyperparameter
    # WINDOW_SIZE = 10  # Context size for optimization
    # NUM_WALKS = 5  # Number of walks per source
    # WALK_LENGTH = 5  # Length of walk per source
    # DIMENSIONS = 128  # Embedding dimension
    # DIRECTED = False  # Graph directed/undirected
    # WORKERS = 8  # Num. parallel workers
    # ITER = 1  # SGD epochs

    P = p  # Return hyperparameter
    Q = q  # In-out hyperparameter
    WINDOW_SIZE = win_size  # Context size for optimization
    NUM_WALKS = num_walks  # Number of walks per source
    WALK_LENGTH = walk_length  # Length of walk per source
    DIMENSIONS = dimension  # Embedding dimension
    DIRECTED = False  # Graph directed/undirected
    WORKERS = 8  # Num. parallel workers
    ITER = iter  # SGD epochs

    # Preprocessing, generate walks

    # create node2vec graph instance
    g_n2v = node2vec.Graph(g_train, DIRECTED, P, Q)
    g_n2v.preprocess_transition_probs()
    walks = g_n2v.simulate_walks(NUM_WALKS, WALK_LENGTH)
    walks = [list(map(str, walk)) for walk in walks]

    # Train skip-gram model
    model = Word2Vec(
        walks,
        size=DIMENSIONS,
        window=WINDOW_SIZE,
        min_count=0,
        sg=1,
        workers=WORKERS,
        iter=ITER,
    )

    # Store embeddings mapping
    emb_mappings = model.wv
    model.wv.save_word2vec_format('Neo-Emb-2.emd')

    # Create node embeddings matrix (rows = nodes, columns = embedding features)
    emb_list = []
    for node_index in range(0, adj_sparse.shape[0]):
        node_str = str(node_index)
        node_emb = emb_mappings[node_str]
        emb_list.append(node_emb)
    emb_matrix = np.vstack(emb_list)

    def get_edge_embeddings(edge_list):
        """
        Generate bootstrapped edge embeddings (as is done in node2vec paper)
        Edge embedding for (v1, v2) = hadamard product of node embeddings for
        v1, v2.
        """
        embs = []
        for edge in edge_list:
            node1 = edge[0]
            node2 = edge[1]
            emb1 = emb_matrix[node1]
            emb2 = emb_matrix[node2]
            edge_emb = np.multiply(emb1, emb2)
            embs.append(edge_emb)
        embs = np.array(embs)
        return embs

    # Train-set edge embeddings
    pos_train_edge_embs = get_edge_embeddings(train_edges)
    neg_train_edge_embs = get_edge_embeddings(train_edges_false)
    train_edge_embs = np.concatenate(
        [pos_train_edge_embs, neg_train_edge_embs])

    # Create train-set edge labels: 1 = real edge, 0 = false edge
    train_edge_labels = np.concatenate(
        [np.ones(len(train_edges)),
         np.zeros(len(train_edges_false))])

    # Val-set edge embeddings, labels
    pos_val_edge_embs = get_edge_embeddings(val_edges)
    neg_val_edge_embs = get_edge_embeddings(val_edges_false)
    val_edge_embs = np.concatenate([pos_val_edge_embs, neg_val_edge_embs])
    val_edge_labels = np.concatenate(
        [np.ones(len(val_edges)),
         np.zeros(len(val_edges_false))])

    # Test-set edge embeddings, labels
    pos_test_edge_embs = get_edge_embeddings(test_edges)
    neg_test_edge_embs = get_edge_embeddings(test_edges_false)
    test_edge_embs = np.concatenate([pos_test_edge_embs, neg_test_edge_embs])

    # Create test-set edge labels: 1 = real edge, 0 = false edge
    test_edge_labels = np.concatenate(
        [np.ones(len(test_edges)),
         np.zeros(len(test_edges_false))])

    # Train a classifier on the train-set edge embeddings
    # (random forest here; logistic regression left commented out)
    # edge_classifier = LogisticRegression(random_state=0)
    edge_classifier = RandomForestClassifier(max_depth=10, random_state=0)
    edge_classifier.fit(train_edge_embs, train_edge_labels)

    # Predicted edge scores: probability of being of class "1" (real edge)
    val_preds = edge_classifier.predict_proba(val_edge_embs)[:, 1]
    val_roc = roc_auc_score(val_edge_labels, val_preds)
    val_ap = average_precision_score(val_edge_labels, val_preds)

    # Predicted edge scores: probability of being of class "1" (real edge)
    test_preds = edge_classifier.predict_proba(test_edge_embs)[:, 1]
    test_roc = roc_auc_score(test_edge_labels, test_preds)
    test_ap = average_precision_score(test_edge_labels, test_preds)

    result_file = open(result_file_path, "w")
    for para, value in emb_paras.items():
        if para != "roc_file":
            result_file.write(para + "  :  " + str(value))
            result_file.write("\n")

    for para, value in tuning_para.items():
        if para != "fig_path":
            result_file.write(para + "  :  " + str(value))
            result_file.write("\n")

    result_file.write("node2vec Validation ROC score: " + str(val_roc))
    result_file.write("\n")
    result_file.write("node2vec Validation AP score: " + str(val_ap))
    result_file.write("\n")
    result_file.write("node2vec Test ROC score: " + str(test_roc))
    result_file.write("\n")
    result_file.write("node2vec Test AP score: " + str(test_ap))
    result_file.write("\n")
    silhouette_score, purity_score = cluster(**tuning_para)
    result_file.write("silhouette score:" + str(silhouette_score))
    result_file.write("\n")
    result_file.write("purity score:" + str(purity_score))
    result_file.write("\n")
    result_file.close()
    fpr, tpr, _ = roc_curve(test_edge_labels, test_preds)
    roc_auc = auc(fpr, tpr)
    lw = 2
    plt.plot(fpr,
             tpr,
             color='darkorange',
             lw=lw,
             label='%s (area = %0.2f)' % ('RF', roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('test')
    plt.legend(loc='lower right')

    plt.savefig(rocfile)
    plt.close()