Code example #1
0
def fetch_permgnn_embeddings(av,gr: PermGnnGraph):
  """
  Fetch (or compute and cache) PermGNN embeddings for all nodes of `gr`.

  Temporarily sets av.TASK = "PermGNN" so that checkpoint and pickle paths
  resolve to the PermGNN artifacts, and restores the caller's original task
  on exit.  Fix: restoration now happens in a `finally` block — previously a
  failure part-way (missing checkpoint, I/O error) left av.TASK clobbered
  for the caller.

  If the embedding pickle already exists it is loaded; otherwise the trained
  model is loaded (latest epoch when VAL_FRAC==0, else the best validation
  checkpoint), embeddings are computed batch-wise, and pickled.

  Args:
    av: config/argument namespace (TASK, DATASET_NAME, TEST_FRAC, VAL_FRAC,
        BATCH_SIZE, has_cuda, want_cuda, ...).
    gr: the PermGnnGraph whose nodes are embedded.

  Returns:
    Tensor of per-node embeddings, moved to the device chosen by cudavar.
  """
  avTask = av.TASK

  av.TASK = "PermGNN"
  try:
    pickle_fp = "./data/embeddingPickles/"+av.TASK+"_"+av.DATASET_NAME+"_tfrac_"+str(av.TEST_FRAC)+"_vfrac_"+str(av.VAL_FRAC) + "_embedding_mat.pkl"
    if not os.path.exists(pickle_fp):
      query_nodes, \
      list_training_edges, \
      list_training_non_edges, \
      list_test_edges, \
      list_test_non_edges, \
      list_val_edges, \
      list_val_non_edges = fetch_lp_data_split(av,gr)

      prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)
      device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
      permNet = PermutationGenerator(av,gr).to(device)
      permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
      #if VAL_FRAC is 0, we fetch model weights from last trained epoch
      # else we fetch best performing model on validation dataset
      if av.VAL_FRAC==0:
        checkpoint = load_model(av)
        logger.info("Loading latest trained model from training epoch %d",checkpoint['epoch'])
      else:
        es = EarlyStoppingModule(av)
        checkpoint = es.load_best_model()
        logger.info("Loading best validation result model from training epoch %d",checkpoint['epoch'])

      permGNN.load_state_dict(checkpoint['model_state_dict'])

      #batch nodes to avoid memory limit crash for larger graphs
      all_nodes = list(range(permGNN.gr.get_num_nodes()))
      all_embeds = cudavar(av,torch.tensor([]))
      for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) :
        batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
        set_size = permGNN.permNet.set_size_all[batch_nodes]
        neighbour_features = permGNN.permNet.padded_neighbour_features_all[batch_nodes]
        all_embeds = torch.cat((all_embeds,permGNN.getEmbeddingForFeatures(set_size,neighbour_features).data),dim=0)

      logger.info("Creating permgnn embedding pickle at %s",pickle_fp)
      with open(pickle_fp, 'wb') as f:
        pickle.dump(all_embeds, f)

    else:
      logger.info("Loading permgnn embedding pickle from %s",pickle_fp)
      with open(pickle_fp, 'rb') as f:
        all_embeds = pickle.load(f)
  finally:
    # always restore the caller's task, even on error
    av.TASK = avTask
  return cudavar(av,all_embeds)
Code example #2
0
def performance_analysis(av,gr: PermGnnGraph):
  """
  Quantify how sensitive the trained PermGNN is to permutations of each
  node's neighbour-feature sequence.

  Loads the trained model (latest epoch when VAL_FRAC==0, else the best
  validation checkpoint), computes canonical-order LSTM outputs and
  embeddings for every node, then for each sampling fraction in
  [0, 0.05, ..., 1] and each permutation type ('rand', 'rev') recomputes
  them on permuted neighbour sequences.  For each configuration it stores,
  in-place into `all_info`, cosine-similarity lists between canonical and
  permuted inputs / LSTM outputs / embeddings, plus the relative variation
  of the training loss.  The enriched `all_info` dict is pickled under
  data/KTAU_var_data/.

  NOTE(review): assumes all_info[sample_frac][perm_type][n_perm] maps each
  node to its permuted neighbour index order (and accepts extra string
  keys) — confirm against generate_global_permutations.
  """
  query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av,gr)

  prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)
  
  #num_perms = 5
  num_perms = 1
  all_info = generate_global_permutations(av,gr,num_perms) 

  device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
  permNet = PermutationGenerator(av,gr).to(device)
  permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
  #if VAL_FRAC is 0, we fetch model weights from last trained epoch
  # else we fetch  best performing model on validation dataset
  if av.VAL_FRAC==0:
    checkpoint = load_model(av)
    logger.info("Loading latest trained model from training epoch %d",checkpoint['epoch'])
  else:
    es = EarlyStoppingModule(av)
    checkpoint = es.load_best_model()
    logger.info("Loading best validation result model from training epoch %d",checkpoint['epoch'])

  permGNN.load_state_dict(checkpoint['model_state_dict'])

  # row-wise cosine similarity between canonical and permuted tensors
  cos = nn.CosineSimilarity(dim=1, eps=1e-6)
  
  all_nodes = list(range(permGNN.gr.get_num_nodes()))

  # canonical (unpermuted) forward pass over all nodes
  canonical_lstm_op = cudavar(av,torch.tensor([]))
  canonical_embeds = cudavar(av,torch.tensor([]))
  #batch and send nodes to avoid memory limit crash for larger graphs
  for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
    batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
    set_size = permGNN.permNet.set_size_all[batch_nodes]
    neighbour_features = permGNN.permNet.padded_neighbour_features_all[batch_nodes]
    lstm_op,embeds = permGNN.getEmbeddingForFeatures(set_size,neighbour_features,True)
    canonical_lstm_op = torch.cat((canonical_lstm_op,lstm_op),dim=0)
    canonical_embeds = torch.cat((canonical_embeds,embeds),dim=0)
  # flatten per-node padded feature matrices to vectors for cosine scoring
  canonical_inputs = permGNN.permNet.padded_neighbour_features_all.flatten(1) 
   
  canonical_tr_loss = compute_loss(av,gr,canonical_embeds)   
  
  for sample_frac in [0,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.55,0.6,0.65,0.7,0.75,0.8,0.85,0.9,0.95,1]:
    for perm_type in ['rand','rev']:
      for n_perm in range(num_perms):
        perm_info = all_info[sample_frac][perm_type][n_perm]  
        all_embeds = cudavar(av,torch.tensor([]))
        all_lstm_op = cudavar(av,torch.tensor([]))
        #permute neighbour features
        perm_neighbour_features = []
        for node in range(gr.get_num_nodes()): 
          node_feats_orig = permGNN.permNet.padded_neighbour_features_all[node]
          node_feats_perm = node_feats_orig[torch.tensor(perm_info[node])]
          perm_neighbour_features.append(node_feats_perm)
        # re-pad so all nodes share the same sequence length again
        perm_neighbour_features = pad_sequence(perm_neighbour_features,batch_first=True)
        #batch and send nodes to avoid memory limit crash for larger graphs
        for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
          batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
          set_size = permGNN.permNet.set_size_all[batch_nodes]
          neighbour_features = perm_neighbour_features[batch_nodes]
          lstm_op,embeds = permGNN.getEmbeddingForFeatures(set_size,neighbour_features,True)
          all_lstm_op = torch.cat((all_lstm_op,lstm_op),dim=0)
          all_embeds = torch.cat((all_embeds,embeds),dim=0)
        
        # per-node sensitivity scores: 1.0 means unaffected by the permutation
        all_info[sample_frac][perm_type][n_perm]['inputs_sens_score_list'] = cos(canonical_inputs, perm_neighbour_features.flatten(1))
        all_info[sample_frac][perm_type][n_perm]['lstm_op_sens_score_list'] = cos(canonical_lstm_op,all_lstm_op) 
        all_info[sample_frac][perm_type][n_perm]['embeds_sens_score_list'] = cos(canonical_embeds,all_embeds)

        # relative change in training loss caused by the permutation
        perm_tr_loss = compute_loss(av,gr,all_embeds)
        all_info[sample_frac][perm_type][n_perm]['loss_var'] = abs(perm_tr_loss-canonical_tr_loss)/canonical_tr_loss
  fname = av.DIR_PATH+"/data/KTAU_var_data/" + "Ktau_variation_data"+"_"+av.TASK+"_"+av.DATASET_NAME+"_tfrac_"+str(av.TEST_FRAC)+"_vfrac_"+str(av.VAL_FRAC) + "_data.pkl"
  pickle.dump(all_info,open(fname,"wb"))
Code example #3
0
def run_graph_lp(av,gr: PermGnnGraph):
  """
  Train PermGNN for link prediction on `gr` with alternating max/min phases.

  Per epoch: unless TASK is "1Perm" or "Multiperm", a max phase first updates
  the permutation generator by ascending the loss (gradient of -loss via
  optimizerPerm), then a min phase updates the embedding function
  (optimizerFunc).  The model is checkpointed after every epoch.  When
  VAL_FRAC != 0, training runs until the early-stopping criterion fires;
  otherwise for exactly NUM_EPOCHS epochs.

  NOTE(review): assumes set_learnable_parameters freezes/unfreezes the two
  parameter groups per phase — confirm against its definition.
  """
  #if av.has_cuda:
  #  torch.cuda.reset_max_memory_allocated(0)
  query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av,gr)

  prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)

  ###permGNN part starts
  device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
  permNet = PermutationGenerator(av,gr).to(device)
  permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
  
  es = EarlyStoppingModule(av)

  optimizerPerm,optimizerFunc = init_optimizers(av,permGNN)
  starting_epoch = 0 
  
  #if True, load latest epoch model and optimizer state and resume training 
  #if False, train from scratch
  if (av.RESUME_RUN):
    checkpoint = load_model(av)
    permGNN.load_state_dict(checkpoint['model_state_dict'])
    optimizerPerm.load_state_dict(checkpoint['optimizer_perm_state_dict'])
    optimizerFunc.load_state_dict(checkpoint['optimizer_func_state_dict'])
    starting_epoch = checkpoint['epoch'] + 1
    #NOTE: av.RESUME_RUN will become False (av is replaced by the
    #checkpointed copy), but currently it's unused anywhere else
    av = checkpoint['av']

  nodes = list(range(gr.get_num_nodes()))
  #for epoch in range(starting_epoch,av.NUM_EPOCHS):
  epoch = starting_epoch
  #if VAL_FRAC is 0, we train model for NUM_EPOCHS
  #else we train model till early stopping criteria is met
  while av.VAL_FRAC!=0 or epoch<av.NUM_EPOCHS:
    random.shuffle(nodes)
    if av.TASK != "1Perm" and av.TASK != "Multiperm":
      # max phase: adversarially update the permutation generator
      start_time = time.time()
      set_learnable_parameters(permGNN,isMaxPhase=True)

      epochLoss=0
      for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
        nodes_batch = nodes[i:i+av.BATCH_SIZE]
        permGNN.zero_grad()
        # negated loss: stepping the optimizer ascends the true loss
        loss = -permGNN.computeLoss(nodes_batch)
        if loss==0:
            continue
        loss.backward()
        optimizerPerm.step()
        epochLoss = epochLoss + loss.item()
      score_list = log_scores(av,permGNN,query_nodes,list_val_edges, list_val_non_edges,list_test_edges,list_test_non_edges,start_time,epochLoss,epoch,phase="max")

    # min phase: update the embedding function against the current permutations
    start_time = time.time()
    set_learnable_parameters(permGNN,isMaxPhase=False)

    epochLoss = 0 
    for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
      nodes_batch = nodes[i:i+av.BATCH_SIZE]
      permGNN.zero_grad()
      loss = permGNN.computeLoss(nodes_batch)
      # skip degenerate batches that produce no loss signal
      if loss==0:
          continue
      loss.backward()
      optimizerFunc.step()
      epochLoss = epochLoss + loss.item()       
    score_list = log_scores(av,permGNN,query_nodes,list_val_edges, list_val_non_edges,list_test_edges,list_test_non_edges,start_time,epochLoss,epoch,phase="min")

    # checkpoint every epoch so RESUME_RUN can pick up from here
    save_model(av,permGNN,optimizerPerm, optimizerFunc, epoch, saveAllEpochs=False)
    if av.VAL_FRAC!=0:
      if es.check(score_list,permGNN,epoch):
        break
    epoch+=1
  if av.has_cuda:
    logger.info("Max gpu memory used: %.6f ",torch.cuda.max_memory_allocated(device=0)/(1024**3))
Code example #4
0
def lp_permute_test_result(av,gr: PermGnnGraph):
  """
  Log link-prediction test scores for the canonical neighbour ordering and
  for 10 runs with randomly permuted neighbour sequences.

  Loads the trained model (latest epoch when VAL_FRAC==0, else the best
  validation checkpoint), computes embeddings batch-wise, and logs
  AUC / AP / MAP / MRR each time.  A permutation-invariant model should
  score (near-)identically across all runs.
  """
  query_nodes, \
    list_training_edges, \
    list_training_non_edges, \
    list_test_edges, \
    list_test_non_edges, \
    list_val_edges, \
    list_val_non_edges = fetch_lp_data_split(av,gr)

  prep_permgnn_graph(av,gr,query_nodes,list_training_edges,list_training_non_edges,list_val_edges,list_test_edges,list_val_non_edges,list_test_non_edges)

  device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
  permNet = PermutationGenerator(av,gr).to(device)
  permGNN = PermutationInvariantGNN(av,gr,permNet).to(device)
  #if VAL_FRAC is 0, we fetch model weights from last trained epoch
  # else we fetch  best performing model on validation dataset
  if av.VAL_FRAC==0:
    checkpoint = load_model(av)
    logger.info("Loading latest trained model from training epoch %d",checkpoint['epoch'])
  else:
    es = EarlyStoppingModule(av)
    checkpoint = es.load_best_model()
    logger.info("Loading best validation result model from training epoch %d",checkpoint['epoch'])

  permGNN.load_state_dict(checkpoint['model_state_dict'])

  logger.info("Test scores  with canonical input sequence")
  start_time = time.time()
  
  all_nodes = list(range(permGNN.gr.get_num_nodes()))
  all_embeds = cudavar(av,torch.tensor([]))
  #batch and send nodes to avoid memory limit crash for larger graphs
  for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
    batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
    set_size = permGNN.permNet.set_size_all[batch_nodes]
    neighbour_features = permGNN.permNet.padded_neighbour_features_all[batch_nodes]
    all_embeds = torch.cat((all_embeds,permGNN.getEmbeddingForFeatures(set_size,neighbour_features).data),dim=0)

  auc_score, ap_score, map_score, mrr_score = compute_scores_from_embeds(av,all_embeds,query_nodes,list_test_edges,list_test_non_edges)

  end_time = time.time()
  logger.info("auc_score: %.6f ap_score: %.6f map_score: %.6f mrr_score: %.6f Time: %.2f",auc_score,ap_score,map_score,mrr_score ,end_time-start_time)

  logger.info("Test scores with randomly permuted input sequence")
  for num_run in range(10):
    start_time = time.time()
  
    all_nodes = list(range(permGNN.gr.get_num_nodes()))
    all_embeds = cudavar(av,torch.tensor([]))
    #permute neighbour features
    # randperm(size) shuffles only the first `size` (real, unpadded) rows of
    # each node's padded matrix; pad_sequence then re-pads to a common length
    perm_neighbour_features = pad_sequence([mat[torch.randperm(int(size))] \
                                            for (mat,size) in zip(permGNN.permNet.padded_neighbour_features_all,permGNN.permNet.set_size_all)],\
                                           batch_first=True)
    #batch and send nodes to avoid memory limit crash for larger graphs
    for i in range(0,permGNN.gr.get_num_nodes(),av.BATCH_SIZE) : 
      batch_nodes = all_nodes[i:i+av.BATCH_SIZE]
      set_size = permGNN.permNet.set_size_all[batch_nodes]
      #neighbour_features = permGNN.padded_neighbour_features_all[batch_nodes]
      neighbour_features = perm_neighbour_features[batch_nodes]      
      all_embeds = torch.cat((all_embeds,permGNN.getEmbeddingForFeatures(set_size,neighbour_features).data),dim=0)

    auc_score, ap_score, map_score, mrr_score = compute_scores_from_embeds(av,all_embeds,query_nodes,list_test_edges,list_test_non_edges)

    end_time = time.time()
    logger.info("auc_score: %.6f ap_score: %.6f map_score: %.6f mrr_score: %.6f Time: %.2f",auc_score,ap_score,map_score,mrr_score ,end_time-start_time)
Code example #5
0
def run_lsh(av, gr: PermGnnGraph):
    """
    Compare link-prediction retrieval with and without LSH bucketing.

    Loads cached PermGNN embeddings plus two hashcode matrices (trained and
    gaussian baseline), scores three configurations ('nohash', 'trained',
    'gaussian') via lsh.get_hash_lp_scores, and writes one summary line per
    configuration to a text file under data/logDir/.

    Fix: the results file was opened with a bare open() and never closed
    (leaked handle, output possibly left unflushed); it is now managed with
    a `with` block.
    """
    all_hashing_info_dict = {}
    all_embeds = fetch_permgnn_embeddings(av, gr)

    pickle_fp = av.DIR_PATH + "/data/hashcodePickles/" + av.TASK + "_" + av.DATASET_NAME + "_tfrac_" + str(
        av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) + "_L1_" + str(
            av.LAMBDA1) + "_L2_" + str(av.LAMBDA2) + "_hashcode_mat.pkl"
    with open(pickle_fp, 'rb') as f:
        all_hashcodes = pickle.load(f)
    all_hashcodes = all_hashcodes.float()

    pickle_fp = av.DIR_PATH + "/data/hashcodePickles/" + av.TASK + "_gaussian_" + av.DATASET_NAME + "_tfrac_" + str(
        av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) + "_hashcode_mat.pkl"
    with open(pickle_fp, 'rb') as f:
        all_hashcodes_gaussian = pickle.load(f)
    all_hashcodes_gaussian = all_hashcodes_gaussian.float()

    query_nodes, \
      list_training_edges, \
      list_training_non_edges, \
      list_test_edges, \
      list_test_non_edges, \
      list_val_edges, \
      list_val_non_edges = fetch_lp_data_split(av,gr)

    prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                       list_training_non_edges, list_val_edges,
                       list_test_edges, list_val_non_edges,
                       list_test_non_edges)
    #TODO: input hparam support for d and k
    d = 8   # hashcode dimensionality
    k = 10  # top-k cutoff for the retrieval metrics

    # NOTE(review): the literal 10 below presumably is a separate LSH
    # parameter (e.g. number of hash tables) that coincidentally equals k —
    # confirm against the LSH constructor before parameterizing.
    lsh = LSH(av, gr, 10, d)
    # baseline: exhaustive scoring over the test pairs, no bucketing
    _, _, ap, ndcg, map_sc, mndcg, time_total, time_dict = lsh.get_hash_lp_scores(
        all_embeds, all_hashcodes, query_nodes, list_test_edges,
        list_test_non_edges, k, True)
    node_pair_count = (gr.get_num_nodes() * (gr.get_num_nodes() - 1)) / 2
    test_pair_count = len(list_test_edges) + len(list_test_non_edges)
    all_hashing_info_dict['nohash'] = [
        node_pair_count, test_pair_count, ap, ndcg, map_sc, mndcg, time_total,
        time_dict['end_score_computation'] -
        time_dict['start_score_computation'],
        time_dict['end_heap_procedure'] - time_dict['start_heap_procedure'],
        time_dict['end_candidate_list_gen'] -
        time_dict['start_candidate_list_gen']
    ]

    lsh.init_candidate_set(list_test_edges, list_test_non_edges)

    # bucketed retrieval using the trained hashcodes
    lsh.init_hash_code_mat(all_hashcodes)
    lsh.bucketify()
    len_test, len_candidate, ap, ndcg, map_sc, mndcg, time_total, time_dict = lsh.get_hash_lp_scores(
        all_embeds, all_hashcodes, query_nodes, list_test_edges,
        list_test_non_edges, k, False)
    all_hashing_info_dict['trained'] = [
        len_test, len_candidate, ap, ndcg, map_sc, mndcg, time_total,
        time_dict['end_score_computation'] -
        time_dict['start_score_computation'],
        time_dict['end_heap_procedure'] - time_dict['start_heap_procedure'],
        time_dict['end_candidate_list_gen'] -
        time_dict['start_candidate_list_gen']
    ]

    # bucketed retrieval using the gaussian (random projection) hashcodes
    lsh.init_hash_code_mat(all_hashcodes_gaussian)
    lsh.bucketify()
    len_test, len_candidate, ap, ndcg, map_sc, mndcg, time_total, time_dict = lsh.get_hash_lp_scores(
        all_embeds, all_hashcodes_gaussian, query_nodes, list_test_edges,
        list_test_non_edges, k, False)
    all_hashing_info_dict['gaussian'] = [
        len_test, len_candidate, ap, ndcg, map_sc, mndcg, time_total,
        time_dict['end_score_computation'] -
        time_dict['start_score_computation'],
        time_dict['end_heap_procedure'] - time_dict['start_heap_procedure'],
        time_dict['end_candidate_list_gen'] -
        time_dict['start_candidate_list_gen']
    ]

    #print in file
    fp = "data/logDir/hashing_info_" + av.DATASET_NAME + "_l1_" + str(
        av.LAMBDA1) + "_l2_" + str(av.LAMBDA2) + ".txt"
    logger.info("Writing results to file %s", fp)
    with open(fp, 'w+') as f:
        for ver in ['nohash', 'trained', 'gaussian']:
            info = all_hashing_info_dict[ver]
            f.write(
                "version: {} node_pair_count: {} test_pair_count: {}  ap: {} ndcg: {} map: {} mndcg: {} time: {} time_candidate_list_gen: {} time_scoring: {} time_heaping: {}"
                .format(ver, info[0], info[1], info[2], info[3], info[4], info[5],
                        info[6], info[9], info[7], info[8]))
            f.write('\n')
Code example #6
0
def run_graph_lp_hash(av, gr: PermGnnGraph):
    """
    Train the hash-code generator on frozen PermGNN embeddings and dump the
    resulting per-node hashcodes to a pickle.

    Skipped entirely if the hashcode pickle already exists.  The training
    loss combines the three components returned by computeLoss, weighted by
    LAMBDA1, LAMBDA2 and their complement.  When VAL_FRAC != 0, training
    stops when the early-stopping module flags no improvement in
    -totalEpochLoss; otherwise it runs for NUM_EPOCHS.

    NOTE(review): assumes EarlyStoppingModule(av, 50, 0.001) takes
    (patience, tolerance) extra args — confirm against its definition.
    """
    pickle_fp = av.DIR_PATH + "/data/hashcodePickles/" + av.TASK + "_" + av.DATASET_NAME + "_tfrac_" + str(
        av.TEST_FRAC) + "_vfrac_" + str(av.VAL_FRAC) + "_L1_" + str(
            av.LAMBDA1) + "_L2_" + str(av.LAMBDA2) + "_hashcode_mat.pkl"
    if not os.path.exists(pickle_fp):
        #if av.has_cuda:
        #  torch.cuda.reset_max_memory_allocated(0)
        #fetch permGNN embeddings
        device = "cuda" if av.has_cuda and av.want_cuda else "cpu"
        query_nodes, \
          list_training_edges, \
          list_training_non_edges, \
          list_test_edges, \
          list_test_non_edges, \
          list_val_edges, \
          list_val_non_edges = fetch_lp_data_split(av,gr)

        prep_permgnn_graph(av, gr, query_nodes, list_training_edges,
                           list_training_non_edges, list_val_edges,
                           list_test_edges, list_val_non_edges,
                           list_test_non_edges)
        all_embeds = fetch_permgnn_embeddings(av, gr)

        hashCodeGenerator = HashCodeGenerator(av, gr).to(device)
        hashCodeGenerator.init_embeddings(all_embeds)
        # non-neighbour matrix is built from training edges only
        hashCodeGenerator.init_non_nbr_mat(list_training_edges)

        es = EarlyStoppingModule(av, 50, 0.001)

        optimizerFunc = torch.optim.SGD(hashCodeGenerator.parameters(),
                                        lr=av.LEARNING_RATE_FUNC)
        nodes = list(range(gr.get_num_nodes()))
        epoch = 0
        #if VAL_FRAC is 0, we train model for NUM_EPOCHS
        #else we train model till early stopping criteria is met
        while av.VAL_FRAC != 0 or epoch < av.NUM_EPOCHS:
            random.shuffle(nodes)
            start_time = time.time()
            totalEpochLoss = 0
            for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
                nodes_batch = nodes[i:i + av.BATCH_SIZE]
                hashCodeGenerator.zero_grad()
                loss1, loss2, loss3, num_nodes = hashCodeGenerator.computeLoss(
                    nodes_batch)
                # weighted combination: LAMBDA1*loss1 + LAMBDA2*loss2 +
                # (1 - LAMBDA1 - LAMBDA2)*loss3, each normalized by batch size
                # (loss3 by its square)
                totalLoss = (av.LAMBDA1 / num_nodes) * loss1 + (
                    av.LAMBDA2 / num_nodes) * loss2 + (
                        (1 -
                         (av.LAMBDA1 + av.LAMBDA2)) / (num_nodes**2)) * loss3
                totalLoss.backward()
                optimizerFunc.step()
                totalEpochLoss = totalEpochLoss + totalLoss.item()
            end_time = time.time()
            logger.info("Epoch: %d totalEpochLoss: %f time: %.2f", epoch,
                        totalEpochLoss, end_time - start_time)
            if av.VAL_FRAC != 0:
                # negated so that "higher is better" for the stopper
                if es.check([-totalEpochLoss], hashCodeGenerator, epoch):
                    break
            epoch += 1
        if av.has_cuda:
            logger.info("Max gpu memory used: %.6f ",
                        torch.cuda.max_memory_allocated(device=0) / (1024**3))

        #generate and dump hashcode  pickles
        all_nodes = list(range(gr.get_num_nodes()))
        all_hashcodes = cudavar(av, torch.tensor([]))
        for i in range(0, gr.get_num_nodes(), av.BATCH_SIZE):
            batch_nodes = all_nodes[i:i + av.BATCH_SIZE]
            all_hashcodes = torch.cat(
                (all_hashcodes, hashCodeGenerator.forward(batch_nodes).data),
                dim=0)
        logger.info("Dumping trained hashcode pickle at %s", pickle_fp)
        with open(pickle_fp, 'wb') as f:
            pickle.dump(all_hashcodes, f)