Code Example #1
num_side_features = 0

# feature loading
if not FEATURES:
    u_features = sp.identity(num_users, format='csr')
    v_features = sp.identity(num_items, format='csr')

    u_features, v_features = preprocess_user_item_features(
        u_features, v_features)

elif FEATURES and u_features is not None and v_features is not None:
    # use features as side information and node_id's as node input features

    print("Normalizing feature vectors...")
    u_features_side = normalize_features(u_features)
    v_features_side = normalize_features(v_features)

    u_features_side, v_features_side = preprocess_user_item_features(
        u_features_side, v_features_side)

    u_features_side = np.array(u_features_side.todense(), dtype=np.float32)
    v_features_side = np.array(v_features_side.todense(), dtype=np.float32)

    num_side_features = u_features_side.shape[1]

    # node id's for node input features
    id_csr_v = sp.identity(num_items, format='csr')
    id_csr_u = sp.identity(num_users, format='csr')

    u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v)
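
The snippet above assumes the GC-MC preprocessing helpers preprocess_user_item_features and normalize_features are importable. The exact bodies below are an assumption about that utility module, shown only as a minimal sketch (zero-padding user and item features into one shared feature space, and row-normalizing a sparse feature matrix); verify against your local preprocessing code before relying on it.

import numpy as np
import scipy.sparse as sp

def preprocess_user_item_features(u_features, v_features):
    """Zero-pad user and item features so both matrices share one feature space
    (sketch of the usual GC-MC helper; verify against your preprocessing module)."""
    zero_csr_u = sp.csr_matrix((u_features.shape[0], v_features.shape[1]), dtype=u_features.dtype)
    zero_csr_v = sp.csr_matrix((v_features.shape[0], u_features.shape[1]), dtype=v_features.dtype)
    u_features = sp.hstack([u_features, zero_csr_u], format='csr')
    v_features = sp.hstack([zero_csr_v, v_features], format='csr')
    return u_features, v_features

def normalize_features(feat):
    """Row-normalize a sparse feature matrix; rows that sum to zero are left as zeros."""
    degree = np.asarray(feat.sum(1)).flatten()
    degree[degree == 0.] = np.inf  # avoid division by zero for empty rows
    degree_inv = sp.diags(1. / degree)
    return degree_inv.dot(feat)
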
Code Example #2
def run(user_features, movie_features, learning_rate=0.01, epochs=500, hidden=[500, 75], feat_hidden=64, accumulation='sum', dropout=0.7,
        num_basis_functions=2, features=False, symmetric=True, testing=True):
  """accumulation can be sum or stack"""

  # Set random seed
  # seed = 123 # use only for unit testing
  seed = int(time.time())
  np.random.seed(seed)
  tf.set_random_seed(seed)
  tf.reset_default_graph()

  # Settings
  # ap = argparse.ArgumentParser()
  # # ap.add_argument("-d", "--dataset", type=str, default="ml_100k",
  # #               choices=['ml_100k', 'ml_1m', 'ml_10m', 'douban', 'yahoo_music', 'flixster'],
  # #               help="Dataset string.")

  # ap.add_argument("-lr", "--learning_rate", type=float, default=0.01,
  #                 help="Learning rate")

  # ap.add_argument("-e", "--epochs", type=int, default=2500,
  #                 help="Number training epochs")

  # ap.add_argument("-hi", "--hidden", type=int, nargs=2, default=[500, 75],
  #                 help="Number hidden units in 1st and 2nd layer")

  # ap.add_argument("-fhi", "--feat_hidden", type=int, default=64,
  #                 help="Number hidden units in the dense layer for features")

  # ap.add_argument("-ac", "--accumulation", type=str, default="sum", choices=['sum', 'stack'],
  #                 help="Accumulation function: sum or stack.")

  # ap.add_argument("-do", "--dropout", type=float, default=0.7,
  #                 help="Dropout fraction")

  # ap.add_argument("-nb", "--num_basis_functions", type=int, default=2,
  #                 help="Number of basis functions for Mixture Model GCN.")

  # ap.add_argument("-ds", "--data_seed", type=int, default=1234,
  #                 help="""Seed used to shuffle data in data_utils, taken from cf-nade (1234, 2341, 3412, 4123, 1324).
  #                      Only used for ml_1m and ml_10m datasets. """)

  # ap.add_argument("-sdir", "--summaries_dir", type=str, default='logs/' + str(datetime.datetime.now()).replace(' ', '_'),
  #                 help="Directory for saving tensorflow summaries.")

  # # Boolean flags
  # fp = ap.add_mutually_exclusive_group(required=False)
  # fp.add_argument('-nsym', '--norm_symmetric', dest='norm_symmetric',
  #                 help="Option to turn on symmetric global normalization", action='store_true')
  # fp.add_argument('-nleft', '--norm_left', dest='norm_symmetric',
  #                 help="Option to turn on left global normalization", action='store_false')
  # ap.set_defaults(norm_symmetric=True)

  # fp = ap.add_mutually_exclusive_group(required=False)
  # fp.add_argument('-f', '--features', dest='features',
  #                 help="Whether to use features (1) or not (0)", action='store_true')
  # fp.add_argument('-no_f', '--no_features', dest='features',
  #                 help="Whether to use features (1) or not (0)", action='store_false')
  # ap.set_defaults(features=False)

  # fp = ap.add_mutually_exclusive_group(required=False)
  # fp.add_argument('-ws', '--write_summary', dest='write_summary',
  #                 help="Option to turn on summary writing", action='store_true')
  # fp.add_argument('-no_ws', '--no_write_summary', dest='write_summary',
  #                 help="Option to turn off summary writing", action='store_false')
  # ap.set_defaults(write_summary=False)

  # fp = ap.add_mutually_exclusive_group(required=False)
  # fp.add_argument('-t', '--testing', dest='testing',
  #                 help="Option to turn on test set evaluation", action='store_true')
  # fp.add_argument('-v', '--validation', dest='testing',
  #                 help="Option to only use validation set evaluation", action='store_false')
  # ap.set_defaults(testing=False)


  # args = vars(ap.parse_args())

  # print('Settings:')
  # print(args, '\n')

  # Define parameters
  DATASET = 'ml_100k'
  DATASEED = 1234
  NB_EPOCH = epochs
  DO = dropout
  HIDDEN = hidden
  FEATHIDDEN = feat_hidden
  BASES = num_basis_functions
  LR = learning_rate
  WRITESUMMARY = False
  SUMMARIESDIR = 'logs/' + str(datetime.datetime.now()).replace(' ', '_')
  FEATURES = features
  SYM = symmetric
  TESTING = testing
  ACCUM = accumulation

  SELFCONNECTIONS = False
  SPLITFROMFILE = True
  VERBOSE = True

  NUMCLASSES = 5

  # Splitting dataset in training, validation and test set

  print("Using official MovieLens dataset split u1.base/u1.test with 20% validation set size...")
  u_features = user_features
  v_features = movie_features
  _, _, adj_train, train_labels, train_u_indices, train_v_indices, \
  val_labels, val_u_indices, val_v_indices, test_labels, \
  test_u_indices, test_v_indices, class_values = load_official_trainvaltest_split('ml_100k', TESTING)


  num_users, num_items = adj_train.shape

  num_side_features = 0

  # feature loading
  if not FEATURES:
      u_features = sp.identity(num_users, format='csr')
      v_features = sp.identity(num_items, format='csr')

      u_features, v_features = preprocess_user_item_features(u_features, v_features)

  elif FEATURES and u_features is not None and v_features is not None:
      # use features as side information and node_id's as node input features

      print("Normalizing feature vectors...")
      u_features_side = normalize_features(u_features)
      v_features_side = normalize_features(v_features)

      u_features_side, v_features_side = preprocess_user_item_features(u_features_side, v_features_side)

      u_features_side = np.array(u_features_side.todense(), dtype=np.float32)
      v_features_side = np.array(v_features_side.todense(), dtype=np.float32)

      num_side_features = u_features_side.shape[1]

      # node id's for node input features
      id_csr_v = sp.identity(num_items, format='csr')
      id_csr_u = sp.identity(num_users, format='csr')

      u_features, v_features = preprocess_user_item_features(id_csr_u, id_csr_v)

  else:
      raise ValueError('Features flag is set to true but no features are loaded from dataset ' + DATASET)


  # global normalization
  support = []
  support_t = []
  adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32)

  for i in range(NUMCLASSES):
      # build individual binary rating matrices (supports) for each rating
      support_unnormalized = sp.csr_matrix(adj_train_int == i + 1, dtype=np.float32)

      if support_unnormalized.nnz == 0 and DATASET != 'yahoo_music':
          # yahoo music has dataset split with not all ratings types present in training set.
          # this produces empty adjacency matrices for these ratings.
          sys.exit('ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!')

      support_unnormalized_transpose = support_unnormalized.T
      support.append(support_unnormalized)
      support_t.append(support_unnormalized_transpose)


  support = globally_normalize_bipartite_adjacency(support, symmetric=SYM)
  support_t = globally_normalize_bipartite_adjacency(support_t, symmetric=SYM)

  if SELFCONNECTIONS:
      support.append(sp.identity(u_features.shape[0], format='csr'))
      support_t.append(sp.identity(v_features.shape[0], format='csr'))

  num_support = len(support)
  support = sp.hstack(support, format='csr')
  support_t = sp.hstack(support_t, format='csr')

  if ACCUM == 'stack':
      div = HIDDEN[0] // num_support
      if HIDDEN[0] % num_support != 0:
          print("""\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that
                    it can be evenly split in %d splits.\n""" % (HIDDEN[0], num_support * div, num_support))
      HIDDEN[0] = num_support * div

  # Collect all user and item nodes for test set
  test_u = list(set(test_u_indices))
  test_v = list(set(test_v_indices))
  test_u_dict = {n: i for i, n in enumerate(test_u)}
  test_v_dict = {n: i for i, n in enumerate(test_v)}

  test_u_indices = np.array([test_u_dict[o] for o in test_u_indices])
  test_v_indices = np.array([test_v_dict[o] for o in test_v_indices])

  test_support = support[np.array(test_u)]
  test_support_t = support_t[np.array(test_v)]

  # Collect all user and item nodes for validation set
  val_u = list(set(val_u_indices))
  val_v = list(set(val_v_indices))
  val_u_dict = {n: i for i, n in enumerate(val_u)}
  val_v_dict = {n: i for i, n in enumerate(val_v)}

  val_u_indices = np.array([val_u_dict[o] for o in val_u_indices])
  val_v_indices = np.array([val_v_dict[o] for o in val_v_indices])

  val_support = support[np.array(val_u)]
  val_support_t = support_t[np.array(val_v)]

  # Collect all user and item nodes for train set
  train_u = list(set(train_u_indices))
  train_v = list(set(train_v_indices))
  train_u_dict = {n: i for i, n in enumerate(train_u)}
  train_v_dict = {n: i for i, n in enumerate(train_v)}

  train_u_indices = np.array([train_u_dict[o] for o in train_u_indices])
  train_v_indices = np.array([train_v_dict[o] for o in train_v_indices])

  train_support = support[np.array(train_u)]
  train_support_t = support_t[np.array(train_v)]

  # features as side info
  if FEATURES:
      test_u_features_side = u_features_side[np.array(test_u)]
      test_v_features_side = v_features_side[np.array(test_v)]

      val_u_features_side = u_features_side[np.array(val_u)]
      val_v_features_side = v_features_side[np.array(val_v)]

      train_u_features_side = u_features_side[np.array(train_u)]
      train_v_features_side = v_features_side[np.array(train_v)]

  else:
      test_u_features_side = None
      test_v_features_side = None

      val_u_features_side = None
      val_v_features_side = None

      train_u_features_side = None
      train_v_features_side = None


  placeholders = {
      'u_features': tf.sparse_placeholder(tf.float32, shape=np.array(u_features.shape, dtype=np.int64)),
      'v_features': tf.sparse_placeholder(tf.float32, shape=np.array(v_features.shape, dtype=np.int64)),
      'u_features_nonzero': tf.placeholder(tf.int32, shape=()),
      'v_features_nonzero': tf.placeholder(tf.int32, shape=()),
      'labels': tf.placeholder(tf.int32, shape=(None,)),

      'u_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)),
      'v_features_side': tf.placeholder(tf.float32, shape=(None, num_side_features)),

      'user_indices': tf.placeholder(tf.int32, shape=(None,)),
      'item_indices': tf.placeholder(tf.int32, shape=(None,)),

      'class_values': tf.placeholder(tf.float32, shape=class_values.shape),

      'dropout': tf.placeholder_with_default(0., shape=()),
      'weight_decay': tf.placeholder_with_default(0., shape=()),

      'support': tf.sparse_placeholder(tf.float32, shape=(None, None)),
      'support_t': tf.sparse_placeholder(tf.float32, shape=(None, None)),
  }

  # create model
  if FEATURES:
      model = RecommenderSideInfoGAE(placeholders,
                                     input_dim=u_features.shape[1],
                                     feat_hidden_dim=FEATHIDDEN,
                                     num_classes=NUMCLASSES,
                                     num_support=num_support,
                                     self_connections=SELFCONNECTIONS,
                                     num_basis_functions=BASES,
                                     hidden=HIDDEN,
                                     num_users=num_users,
                                     num_items=num_items,
                                     accum=ACCUM,
                                     learning_rate=LR,
                                     num_side_features=num_side_features,
                                     logging=True)
  else:
      model = RecommenderGAE(placeholders,
                             input_dim=u_features.shape[1],
                             num_classes=NUMCLASSES,
                             num_support=num_support,
                             self_connections=SELFCONNECTIONS,
                             num_basis_functions=BASES,
                             hidden=HIDDEN,
                             num_users=num_users,
                             num_items=num_items,
                             accum=ACCUM,
                             learning_rate=LR,
                             logging=True)

  # Convert sparse placeholders to tuples to construct feed_dict
  test_support = sparse_to_tuple(test_support)
  test_support_t = sparse_to_tuple(test_support_t)

  val_support = sparse_to_tuple(val_support)
  val_support_t = sparse_to_tuple(val_support_t)

  train_support = sparse_to_tuple(train_support)
  train_support_t = sparse_to_tuple(train_support_t)

  u_features = sparse_to_tuple(u_features)
  v_features = sparse_to_tuple(v_features)
  assert u_features[2][1] == v_features[2][1], 'Number of features of users and items must be the same!'

  num_features = u_features[2][1]
  u_features_nonzero = u_features[1].shape[0]
  v_features_nonzero = v_features[1].shape[0]

  # Feed_dicts for validation and test set stay constant over different update steps
  train_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero,
                                        v_features_nonzero, train_support, train_support_t,
                                        train_labels, train_u_indices, train_v_indices, class_values, DO,
                                        train_u_features_side, train_v_features_side)
  # No dropout for validation and test runs
  val_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero,
                                      v_features_nonzero, val_support, val_support_t,
                                      val_labels, val_u_indices, val_v_indices, class_values, 0.,
                                      val_u_features_side, val_v_features_side)

  test_feed_dict = construct_feed_dict(placeholders, u_features, v_features, u_features_nonzero,
                                       v_features_nonzero, test_support, test_support_t,
                                       test_labels, test_u_indices, test_v_indices, class_values, 0.,
                                       test_u_features_side, test_v_features_side)


  # Collect all variables to be logged into summary
  merged_summary = tf.summary.merge_all()

  #sess = tf.Session()
  sess = tf.InteractiveSession()

  sess.run(tf.global_variables_initializer())

  if WRITESUMMARY:
      train_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/train', sess.graph)
      val_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/val')
  else:
      train_summary_writer = None
      val_summary_writer = None

  best_val_score = np.inf
  best_val_loss = np.inf
  best_epoch = 0
  wait = 0

  print('Training...')

  train_loss_values = []
  train_rmse_values = []
  val_loss_values = []
  val_rmse_values = []
  list_embeddings = []

  for epoch in range(NB_EPOCH):

      t = time.time()

      # Run single weight update
      # outs = sess.run([model.opt_op, model.loss, model.rmse], feed_dict=train_feed_dict)
      # with exponential moving averages
      outs = sess.run([model.training_op, model.loss, model.rmse], feed_dict=train_feed_dict)

  
      # print(len(model.embeddings))

      train_avg_loss = outs[1]
      train_rmse = outs[2]

      val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict)

      train_loss_values.append(train_avg_loss)
      train_rmse_values.append(train_rmse)
      val_loss_values.append(val_avg_loss)
      val_rmse_values.append(val_rmse)

      if VERBOSE:
          print("[*] Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(train_avg_loss),
                "train_rmse=", "{:.5f}".format(train_rmse),
                "val_loss=", "{:.5f}".format(val_avg_loss),
                "val_rmse=", "{:.5f}".format(val_rmse),
                "\t\ttime=", "{:.5f}".format(time.time() - t))

      if epoch == NB_EPOCH - 1:
          embedding_users = model.embeddings[0].eval(feed_dict=train_feed_dict)
          embedding_movies = model.embeddings[1].eval(feed_dict=train_feed_dict)

      if val_rmse < best_val_score:
          best_val_score = val_rmse
          best_epoch = epoch

      if epoch % 20 == 0 and WRITESUMMARY:
          # Train set summary
          summary = sess.run(merged_summary, feed_dict=train_feed_dict)
          train_summary_writer.add_summary(summary, epoch)
          train_summary_writer.flush()

          # Validation set summary
          summary = sess.run(merged_summary, feed_dict=val_feed_dict)
          val_summary_writer.add_summary(summary, epoch)
          val_summary_writer.flush()

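      # NOTE: the trailing "and False" below intentionally disables this periodic Polyak-average evaluation;
      # remove it to re-enable the intermediate checkpoint / restore cycle.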
      if epoch % 100 == 0 and epoch > 1000 and not TESTING and False:
          saver = tf.train.Saver()
          save_path = saver.save(sess, "tmp/%s_seed%d.ckpt" % (model.name, DATASEED), global_step=model.global_step)

          # load polyak averages
          variables_to_restore = model.variable_averages.variables_to_restore()
          saver = tf.train.Saver(variables_to_restore)
          saver.restore(sess, save_path)

          val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict)

          print('polyak val loss = ', val_avg_loss)
          print('polyak val rmse = ', val_rmse)

          # Load back normal variables
          saver = tf.train.Saver()
          saver.restore(sess, save_path)


  # store model including exponential moving averages
  saver = tf.train.Saver()
  save_path = saver.save(sess, "tmp/%s.ckpt" % model.name, global_step=model.global_step)


  if VERBOSE:
      print("\nOptimization Finished!")
      print('best validation score =', best_val_score, 'at iteration', best_epoch+1)


  if TESTING:
      test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict)
      print('test loss = ', test_avg_loss)
      print('test rmse = ', test_rmse)

      # restore with polyak averages of parameters
      variables_to_restore = model.variable_averages.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
      saver.restore(sess, save_path)

      test_avg_loss, test_rmse = sess.run([model.loss, model.rmse], feed_dict=test_feed_dict)
      print('polyak test loss = ', test_avg_loss)
      print('polyak test rmse = ', test_rmse)

  else:
      # restore with polyak averages of parameters
      variables_to_restore = model.variable_averages.variables_to_restore()
      saver = tf.train.Saver(variables_to_restore)
      saver.restore(sess, save_path)

      val_avg_loss, val_rmse = sess.run([model.loss, model.rmse], feed_dict=val_feed_dict)
      print('polyak val loss = ', val_avg_loss)
      print('polyak val rmse = ', val_rmse)

  print('global seed = ', seed)

  sess.close()

  return embedding_users, embedding_movies, train_loss_values, train_rmse_values, val_loss_values, val_rmse_values
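
For context, a hypothetical driver for the run() function above might look like the following sketch. The loader call mirrors the one used inside run(), and every hyperparameter value here is an illustrative placeholder, not a setting taken from the original experiments.

# Hypothetical usage sketch for Code Example #2 (all values are illustrative).
split = load_official_trainvaltest_split('ml_100k', True)
user_features, movie_features = split[0], split[1]

(embedding_users, embedding_movies,
 train_loss, train_rmse, val_loss, val_rmse) = run(
    user_features, movie_features,
    learning_rate=0.01, epochs=200, hidden=[500, 75],
    accumulation='stack', features=False, testing=True)

print('final validation RMSE:', val_rmse[-1])
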
Code Example #3
def run(DATASET='douban',
        DATASEED=1234,
        random_seed=123,
        NB_EPOCH=200,
        DO=0,
        HIDDEN=[100, 75],
        FEATHIDDEN=64,
        LR=0.01,
        decay_rate=1.25,
        consecutive_threshold=5,
        FEATURES=False,
        SYM=True,
        TESTING=False,
        ACCUM='stackRGGCN',
        NUM_LAYERS=1,
        GCMC_INDICES=False):
    np.random.seed(random_seed)
    tf.set_random_seed(random_seed)

    SELFCONNECTIONS = False
    SPLITFROMFILE = True
    VERBOSE = False
    BASES = 2
    WRITESUMMARY = False
    SUMMARIESDIR = 'logs/'

    if DATASET == 'ml_1m' or DATASET == 'ml_100k' or DATASET == 'douban':
        NUMCLASSES = 5
    elif DATASET == 'ml_10m':
        NUMCLASSES = 10
        print(
            '\n WARNING: this might run out of RAM, consider using train_minibatch.py for dataset %s'
            % DATASET)
        print(
            'If you want to proceed with this option anyway, uncomment this.\n'
        )
        sys.exit(1)
    elif DATASET == 'flixster':
        NUMCLASSES = 10
    elif DATASET == 'yahoo_music':
        NUMCLASSES = 71
        if ACCUM == 'sum':
            print(
                '\n WARNING: combining DATASET=%s with ACCUM=%s can cause memory issues due to large number of classes.'
                % (DATASET, ACCUM))
            print(
                'Consider using "--accum stack" as an option for this dataset.'
            )
            print(
                'If you want to proceed with this option anyway, uncomment this.\n'
            )
            sys.exit(1)

    # Splitting dataset in training, validation and test set

    if DATASET == 'ml_1m' or DATASET == 'ml_10m':
        if FEATURES:
            datasplit_path = 'data/' + DATASET + '/withfeatures_split_seed' + str(
                DATASEED) + '.pickle'
        else:
            datasplit_path = 'data/' + DATASET + '/split_seed' + str(
                DATASEED) + '.pickle'
    elif FEATURES:
        datasplit_path = 'data/' + DATASET + '/withfeatures.pickle'
    else:
        datasplit_path = 'data/' + DATASET + '/nofeatures.pickle'

    if DATASET == 'flixster' or DATASET == 'douban' or DATASET == 'yahoo_music':
        u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \
         val_labels, val_u_indices, val_v_indices, test_labels, \
         test_u_indices, test_v_indices, class_values = load_data_monti(DATASET, TESTING)

    elif DATASET == 'ml_100k':
        print(
            "Using official MovieLens dataset split u1.base/u1.test with 20% validation set size..."
        )
        u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \
         val_labels, val_u_indices, val_v_indices, test_labels, \
         test_u_indices, test_v_indices, class_values = load_official_trainvaltest_split(DATASET, TESTING)
    else:
        print("Using random dataset split ...")
        u_features, v_features, adj_train, train_labels, train_u_indices, train_v_indices, \
         val_labels, val_u_indices, val_v_indices, test_labels, \
         test_u_indices, test_v_indices, class_values = create_trainvaltest_split(DATASET, DATASEED, TESTING,
                            datasplit_path, SPLITFROMFILE,
                            VERBOSE)

    num_users, num_items = adj_train.shape
    num_side_features = 0

    # feature loading
    if not FEATURES:
        u_features = sp.identity(
            num_users, format='csr')  # features is just one-hot vector!
        v_features = sp.identity(num_items, format='csr')

        u_features, v_features = preprocess_user_item_features(
            u_features, v_features)

    elif FEATURES and u_features is not None and v_features is not None:
        # use features as side information and node_id's as node input features

        print("Normalizing feature vectors...")
        u_features_side = normalize_features(u_features)
        v_features_side = normalize_features(v_features)

        u_features_side, v_features_side = preprocess_user_item_features(
            u_features_side, v_features_side)

        u_features_side = np.array(u_features_side.todense(), dtype=np.float32)
        v_features_side = np.array(v_features_side.todense(), dtype=np.float32)

        num_side_features = u_features_side.shape[1]

        # node id's for node input features
        id_csr_v = sp.identity(num_items, format='csr')
        id_csr_u = sp.identity(num_users, format='csr')

        u_features, v_features = preprocess_user_item_features(
            id_csr_u, id_csr_v)

    else:
        raise ValueError(
            'Features flag is set to true but no features are loaded from dataset '
            + DATASET)

    # print("User features shape: " + str(u_features.shape))
    # print("Item features shape: " + str(v_features.shape))
    # print("adj_train shape: " + str(adj_train.shape))

    # global normalization
    support = []
    support_t = []
    adj_train_int = sp.csr_matrix(adj_train, dtype=np.int32)

    for i in range(NUMCLASSES):
        # build individual binary rating matrices (supports) for each rating
        support_unnormalized = sp.csr_matrix(adj_train_int == i + 1,
                                             dtype=np.float32)

        if support_unnormalized.nnz == 0 and DATASET != 'yahoo_music':
            # yahoo music has dataset split with not all ratings types present in training set.
            # this produces empty adjacency matrices for these ratings.
            sys.exit(
                'ERROR: normalized bipartite adjacency matrix has only zero entries!!!!!'
            )

        support_unnormalized_transpose = support_unnormalized.T
        support.append(support_unnormalized)
        support_t.append(support_unnormalized_transpose)

    support = globally_normalize_bipartite_adjacency(support, symmetric=SYM)
    support_t = globally_normalize_bipartite_adjacency(support_t,
                                                       symmetric=SYM)

    if SELFCONNECTIONS:
        support.append(sp.identity(u_features.shape[0], format='csr'))
        support_t.append(sp.identity(v_features.shape[0], format='csr'))

    num_support = len(support)
    support = sp.hstack(support, format='csr')
    support_t = sp.hstack(support_t, format='csr')
    # support and support_t become 3000x15000 (for douban with 3000 users/items and 5 ratings)
    # support is n_users x (n_items*n_ratings). support_t is n_items x (n_users*ratings)
    # NOTE: support is sparse matrix so the shape may not be as large as expected (?)
    # When is num_support ever not == num_rating_classes?
    # print('support shape: ' + str(support.shape))
    # print('support_t shape: ' + str(support_t.shape))

    if ACCUM == 'stack' or ACCUM == 'stackRGGCN':
        div = HIDDEN[0] // num_support
        if HIDDEN[0] % num_support != 0:
            print(
                """\nWARNING: HIDDEN[0] (=%d) of stack layer is adjusted to %d such that
					  it can be evenly split in %d splits.\n""" %
                (HIDDEN[0], num_support * div, num_support))
        HIDDEN[0] = num_support * div

    ##################################################################################################################
    """ support contains only training set ratings. index into support using user/item indices to create test set support. """
    test_support = val_support = train_support = support
    test_support_t = val_support_t = train_support_t = support_t

    if GCMC_INDICES:
        # Collect all user and item nodes for test set
        test_u = list(set(test_u_indices))
        test_v = list(set(test_v_indices))
        test_support = support[np.array(test_u)]
        test_support_t = support_t[np.array(test_v)]

        # Collect all user and item nodes for validation set
        val_u = list(set(val_u_indices))
        val_v = list(set(val_v_indices))
        val_support = support[np.array(val_u)]
        val_support_t = support_t[np.array(val_v)]

        # Collect all user and item nodes for train set
        train_u = list(set(train_u_indices))
        train_v = list(set(train_v_indices))
        train_support = support[np.array(train_u)]
        train_support_t = support_t[np.array(train_v)]

        test_u_dict = {n: i for i, n in enumerate(test_u)}
        test_v_dict = {n: i for i, n in enumerate(test_v)}
        test_u_indices = np.array([test_u_dict[o] for o in test_u_indices])
        test_v_indices = np.array([test_v_dict[o] for o in test_v_indices])

        val_u_dict = {n: i for i, n in enumerate(val_u)}
        val_v_dict = {n: i for i, n in enumerate(val_v)}
        val_u_indices = np.array([val_u_dict[o] for o in val_u_indices])
        val_v_indices = np.array([val_v_dict[o] for o in val_v_indices])

        train_u_dict = {n: i for i, n in enumerate(train_u)}
        train_v_dict = {n: i for i, n in enumerate(train_v)}
        print('max train_u_indices: {}'.format(max(train_u_indices)))
        train_u_indices = np.array(
            [train_u_dict[o] for o in train_u_indices]
        )  ### HERE IS WHERE indices get changed to suit the new indexing into smaller set of users
        train_v_indices = np.array([train_v_dict[o] for o in train_v_indices])
        print('max train_u_indices after: {}'.format(max(train_u_indices)))

    # print('train_support_shape: {}'.format(train_support.shape)) # if GCMC_INDICES, THIS IS NO LONGER (n_users, n_items*n_rating_types). but < n_users
    ##################################################################################################################

    # features as side info
    if FEATURES:
        test_u_features_side = u_features_side[np.array(test_u)]
        test_v_features_side = v_features_side[np.array(test_v)]

        val_u_features_side = u_features_side[np.array(val_u)]
        val_v_features_side = v_features_side[np.array(val_v)]

        train_u_features_side = u_features_side[np.array(train_u)]
        train_v_features_side = v_features_side[np.array(train_v)]

    else:
        test_u_features_side = None
        test_v_features_side = None

        val_u_features_side = None
        val_v_features_side = None

        train_u_features_side = None
        train_v_features_side = None

    placeholders = {
        'u_features':
        tf.sparse_placeholder(tf.float32,
                              shape=np.array(u_features.shape,
                                             dtype=np.int64)),
        'v_features':
        tf.sparse_placeholder(tf.float32,
                              shape=np.array(v_features.shape,
                                             dtype=np.int64)),
        'u_features_nonzero':
        tf.placeholder(tf.int32, shape=()),
        'v_features_nonzero':
        tf.placeholder(tf.int32, shape=()),
        'labels':
        tf.placeholder(tf.int32, shape=(None, )),
        'u_features_side':
        tf.placeholder(tf.float32, shape=(None, num_side_features)),
        'v_features_side':
        tf.placeholder(tf.float32, shape=(None, num_side_features)),
        'user_indices':
        tf.placeholder(tf.int32, shape=(None, )),
        'item_indices':
        tf.placeholder(tf.int32, shape=(None, )),
        'class_values':
        tf.placeholder(tf.float32, shape=class_values.shape),
        'dropout':
        tf.placeholder_with_default(0., shape=()),
        'weight_decay':
        tf.placeholder_with_default(0., shape=()),
        'support':
        tf.sparse_placeholder(tf.float32, shape=(None, None)),
        'support_t':
        tf.sparse_placeholder(tf.float32, shape=(None, None)),
    }

    ##################################################################################################################
    E_start, E_end = get_edges_matrices(adj_train)
    # E_start = sp.hstack(E_start, format='csr')  # confirm if vstack is correct and not hstack
    # E_end = sp.hstack(E_end, format='csr')

    # placeholders['E_start'] = tf.sparse_placeholder(tf.float32, shape=(None, None, None))
    # placeholders['E_end'] = tf.sparse_placeholder(tf.float32, shape=(None, None, None))

    placeholders['E_start_list'] = []
    placeholders['E_end_list'] = []
    for i in range(num_support):
        placeholders['E_start_list'].append(
            tf.sparse_placeholder(tf.float32, shape=(None, None)))
        placeholders['E_end_list'].append(
            tf.sparse_placeholder(tf.float32, shape=(None, None)))

    # print('shape of E_end for first rating type: {}'.format(E_end[0].toarray().shape))

    ##################################################################################################################

    # create model
    if FEATURES:
        model = RecommenderSideInfoGAE(placeholders,
                                       input_dim=u_features.shape[1],
                                       feat_hidden_dim=FEATHIDDEN,
                                       num_classes=NUMCLASSES,
                                       num_support=num_support,
                                       self_connections=SELFCONNECTIONS,
                                       num_basis_functions=BASES,
                                       hidden=HIDDEN,
                                       num_users=num_users,
                                       num_items=num_items,
                                       accum=ACCUM,
                                       learning_rate=LR,
                                       num_side_features=num_side_features,
                                       logging=True)
    else:
        model = RecommenderGAE(placeholders,
                               input_dim=u_features.shape[1],
                               num_classes=NUMCLASSES,
                               num_support=num_support,
                               self_connections=SELFCONNECTIONS,
                               num_basis_functions=BASES,
                               hidden=HIDDEN,
                               num_users=num_users,
                               num_items=num_items,
                               accum=ACCUM,
                               learning_rate=LR,
                               num_layers=NUM_LAYERS,
                               logging=True)

    # Convert sparse placeholders to tuples to construct feed_dict. sparse placeholders expect tuple of (indices, values, shape)
    test_support = sparse_to_tuple(test_support)
    test_support_t = sparse_to_tuple(test_support_t)

    val_support = sparse_to_tuple(val_support)
    val_support_t = sparse_to_tuple(val_support_t)

    train_support = sparse_to_tuple(train_support)
    train_support_t = sparse_to_tuple(train_support_t)

    u_features = sparse_to_tuple(u_features)
    v_features = sparse_to_tuple(v_features)
    assert u_features[2][1] == v_features[2][
        1], 'Number of features of users and items must be the same!'

    num_features = u_features[2][1]
    u_features_nonzero = u_features[1].shape[0]
    v_features_nonzero = v_features[1].shape[0]

    # setting E_start to be the same for train, val, and test. E_start already only contains train edges (from preprocessing script)
    train_E_start = []
    train_E_end = []
    # print('LENGTH OF E_START: {}'.format(len(E_start)))
    # print('NUM_SUPPORT: {}'.format(num_support))
    for i in range(num_support):
        train_E_start.append(sparse_to_tuple(E_start[i]))
        train_E_end.append(sparse_to_tuple(E_end[i]))
    val_E_start = test_E_start = train_E_start
    val_E_end = test_E_end = train_E_end

    # Feed_dicts for validation and test set stay constant over different update steps
    train_feed_dict = construct_feed_dict(
        placeholders, u_features, v_features, u_features_nonzero,
        v_features_nonzero, train_support, train_support_t, train_labels,
        train_u_indices, train_v_indices, class_values, DO,
        train_u_features_side, train_v_features_side, train_E_start,
        train_E_end)

    # No dropout for validation and test runs. DO = dropout. input for val and test is same u_features and v_features.
    val_feed_dict = construct_feed_dict(
        placeholders, u_features, v_features, u_features_nonzero,
        v_features_nonzero, val_support, val_support_t, val_labels,
        val_u_indices, val_v_indices, class_values, 0., val_u_features_side,
        val_v_features_side, val_E_start, val_E_end)

    test_feed_dict = construct_feed_dict(
        placeholders, u_features, v_features, u_features_nonzero,
        v_features_nonzero, test_support, test_support_t, test_labels,
        test_u_indices, test_v_indices, class_values, 0., test_u_features_side,
        test_v_features_side, test_E_start, test_E_end)

    # Collect all variables to be logged into summary
    merged_summary = tf.summary.merge_all()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    if WRITESUMMARY:
        train_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/train',
                                                     sess.graph)
        val_summary_writer = tf.summary.FileWriter(SUMMARIESDIR + '/val')
    else:
        train_summary_writer = None
        val_summary_writer = None

    best_val_score = np.inf
    best_val_loss = np.inf
    best_epoch = 0
    wait = 0

    print('Training...')

    #### COUNTING PARAMS
    total_parameters = 0
    for variable in tf.trainable_variables():
        # shape is an array of tf.Dimension
        shape = variable.get_shape()
        variable_parameters = 1
        for dim in shape:
            variable_parameters *= dim.value
        total_parameters += variable_parameters
    print('Total params: {}'.format(total_parameters))

    # FOR A VARIABLE LEARNING RATE
    assign_placeholder = tf.placeholder(tf.float32)
    assign_op = model.learning_rate.assign(assign_placeholder)
    old_loss = float('inf')
    # print('Original learning rate is {}'.format(sess.run(model.optimizer._lr)))

    train_rmses, val_rmses, train_losses, val_losses = [], [], [], []
    for epoch in tqdm(range(NB_EPOCH)):
        t = time.time()
        # Run single weight update
        # outs = sess.run([model.opt_op, model.loss, model.rmse], feed_dict=train_feed_dict)
        # with exponential moving averages
        outs = sess.run([model.training_op, model.loss, model.rmse],
                        feed_dict=train_feed_dict)

        train_avg_loss = outs[1]
        train_rmse = outs[2]

        val_avg_loss, val_rmse = sess.run([model.loss, model.rmse],
                                          feed_dict=val_feed_dict)

        # if train_avg_loss > 0.999*old_loss:
        # 	consecutive += 1
        # 	if consecutive >= consecutive_threshold:
        # 		LR /= decay_rate
        # 		sess.run(assign_op, feed_dict={assign_placeholder: LR})
        # 		print('New learning rate is {}'.format(sess.run(model.optimizer._lr)))
        # 		consecutive = 0
        # else:
        # 	consecutive = 0
        # old_loss = train_avg_loss

        train_rmses.append(train_rmse)
        val_rmses.append(val_rmse)
        train_losses.append(train_avg_loss)
        val_losses.append(val_avg_loss)

        if VERBOSE:
            print("[*] Epoch:", '%04d' % (epoch + 1), "train_loss=",
                  "{:.5f}".format(train_avg_loss), "train_rmse=",
                  "{:.5f}".format(train_rmse), "val_loss=",
                  "{:.5f}".format(val_avg_loss), "val_rmse=",
                  "{:.5f}".format(val_rmse), "\t\ttime=",
                  "{:.5f}".format(time.time() - t))

        if val_rmse < best_val_score:
            best_val_score = val_rmse
            best_epoch = epoch

        if epoch % 20 == 0 and WRITESUMMARY:
            # Train set summary
            summary = sess.run(merged_summary, feed_dict=train_feed_dict)
            train_summary_writer.add_summary(summary, epoch)
            train_summary_writer.flush()

            # Validation set summary
            summary = sess.run(merged_summary, feed_dict=val_feed_dict)
            val_summary_writer.add_summary(summary, epoch)
            val_summary_writer.flush()

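        # NOTE: the trailing "and False" below intentionally disables this periodic Polyak-average evaluation;
        # remove it to re-enable the intermediate checkpoint / restore cycle.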
        if epoch % 100 == 0 and epoch > 1000 and not TESTING and False:
            saver = tf.train.Saver()
            save_path = saver.save(sess,
                                   "tmp/%s_seed%d.ckpt" %
                                   (model.name, DATASEED),
                                   global_step=model.global_step)

            # load polyak averages
            variables_to_restore = model.variable_averages.variables_to_restore(
            )
            saver = tf.train.Saver(variables_to_restore)
            saver.restore(sess, save_path)

            val_avg_loss, val_rmse = sess.run([model.loss, model.rmse],
                                              feed_dict=val_feed_dict)

            print('polyak val loss = ', val_avg_loss)
            print('polyak val rmse = ', val_rmse)

            # Load back normal variables
            saver = tf.train.Saver()
            saver.restore(sess, save_path)

    # store model including exponential moving averages
    saver = tf.train.Saver()
    save_path = saver.save(sess,
                           "tmp/%s.ckpt" % model.name,
                           global_step=model.global_step)

    if VERBOSE:
        print("\nOptimization Finished!")
        print('best validation score =', best_val_score, 'at iteration',
              best_epoch)

    if TESTING:
        test_avg_loss, test_rmse = sess.run([model.loss, model.rmse],
                                            feed_dict=test_feed_dict)
        print('test loss = ', test_avg_loss)
        print('test rmse = ', test_rmse)

        # restore with polyak averages of parameters
        variables_to_restore = model.variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        saver.restore(sess, save_path)

        test_avg_loss, test_rmse = sess.run([model.loss, model.rmse],
                                            feed_dict=test_feed_dict)
        print('polyak test loss = ', test_avg_loss)
        print('polyak test rmse = ', test_rmse)

        sess.close()
        tf.reset_default_graph()
        return train_rmses, val_rmses, train_losses, val_losses, test_rmse
    else:
        # restore with polyak averages of parameters
        variables_to_restore = model.variable_averages.variables_to_restore()
        saver = tf.train.Saver(variables_to_restore)
        saver.restore(sess, save_path)

        val_avg_loss, val_rmse = sess.run([model.loss, model.rmse],
                                          feed_dict=val_feed_dict)
        print('polyak val loss = ', val_avg_loss)
        print('polyak val rmse = ', val_rmse)

        sess.close()
        tf.reset_default_graph()
        return train_rmses, val_rmses, train_losses, val_losses, val_rmse
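
Both run() variants convert scipy sparse matrices into (coords, values, shape) tuples before feeding them to tf.sparse_placeholder. The helper below is a sketch of a common sparse_to_tuple implementation, shown as an assumption in case the actual utils module used here differs.

import numpy as np
import scipy.sparse as sp

def sparse_to_tuple(sparse_mx):
    """Convert a scipy sparse matrix to the (coords, values, shape) triple
    expected when feeding a tf.sparse_placeholder (sketch of a common helper)."""
    if not sp.isspmatrix_coo(sparse_mx):
        sparse_mx = sparse_mx.tocoo()
    coords = np.vstack((sparse_mx.row, sparse_mx.col)).transpose()
    values = sparse_mx.data
    shape = sparse_mx.shape
    return coords, values, shape
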
Code Example #4
def create_features_graph(adj, features, labels):
    '''
    Create a features_graph by adding a feature node for each feature dimension and
    connecting it to every node that has that feature.

    :param adj: coo_matrix with dimension n*n where n = number of total nodes (without feature nodes)
    :param features: coo_matrix with dimension n*d where d = feature dim
    :param labels: numpy array
    :return: adj, edges and labels of the features_node_graph
    '''
    '''
    Steps:
        > assign an index to each feature node with value 1 (coo_matrix)
        > add the new coordinates to adj (coo_matrix)
    '''
    # -- convert node_feature
    adj_dim = adj.shape[0] # use this as a start index of features_row
    features_dim = features.shape[1]
    # display2screen(features_dim)

    # # -- feature to feature edges
    # # --option 1 feature nodes have no self loops
    # option = "feature_no_self_loop"
    # ff_row = [i + adj_dim for i in range(features_dim)]
    # ff_col  = [i + adj_dim for i in range(features_dim)]
    # ff_val = [0 for i in range(features_dim)]

    # # -- option 2 features nodes have self loop
    option = 'features_with_self_loop'
    identity_mx = sp.eye(adj.shape[0]).tocoo()
    ff_row, ff_col, ff_val = identity_mx.row.tolist(), identity_mx.col.tolist(), identity_mx.data.tolist()

    # # -- option 3 all features are connected as clique
    # option = 'features_clique'
    # ones_max = np.ones(adj.shape[0])
    # ones_max = sp.coo_matrix(ones_max)
    # ff_row, ff_col, ff_val = ones_max.row.tolist(), ones_max.col.tolist(), ones_max.data.tolist()


    # ff_row, ff_col, ff_val = identity_mx.row, identity_mx.col, identity_mx.data

    # node to feature edges
    features_ind = [i + adj_dim for i in range(features_dim)] # dim = feature_dim
    nodes_ind = [i for i in range(adj_dim)] # dim = adj_dim

    nf_row = []
    nf_col = []
    nf_val = []

    # todo: create unormalized_features_graph.npy
    # file_name = f"adj_features.npy"

    file_name = f"adj_features_graph_normalized_node_option={option}.npy"
    # file_name = "adj_unormalized_features_graph.npy"
    file_path = f'data/preprocessing/{file_name}'
    if os.path.exists(file_path):
        # -- load pre_processed numpy from file_path
        s = time.time()
        adj = np.load(file_path)
        f = time.time()
        total = f-s
        print(f"total time = {total}")
        node_features_row, node_features_col = sp.find(sp.csr_matrix(adj))[0], sp.find(sp.csr_matrix(adj))[1]
    else:

        print("converted node features to features graph...")
        s = time.time()
        for row in nodes_ind:

            '''
            csr_matrix.nonzero()
                eg (array([0, 0, 1, 2, 2]), array([0, 1, 2, 0, 2]))
            '''
            # -- non-zero features of the current row
            non_zero_ind = [ features.nonzero()[1][i] + adj_dim for i,j in enumerate(features.nonzero()[0]) if j == row]
            nf_row += [row for i in range(len(non_zero_ind))]
            nf_col += non_zero_ind

            # -- val = 1 if node has feature in it otherwise 0
            nf_val += [1  for i in features_ind if i in non_zero_ind] # dim = adj_dim
        f = time.time()
        total = f-s
        print(f"time ={total}")

        # --normalized nn_val
        adj = preprocessing.normalize_features(adj)

        nn_row = sp.find(adj)[0].tolist()
        nn_col = sp.find(adj)[1].tolist()
        nn_val = sp.find(adj)[2].tolist()

        # -- create adj of node_features_graph
        node_features_row = ff_row + nf_row + nf_col + nn_row
        node_features_col = ff_col + nf_col + nf_row + nn_col
        node_features_val = ff_val + nf_val + nf_val + nn_val

        assert len(node_features_val) == len(node_features_row) == len(node_features_col), f"{len(node_features_val)} == {len(node_features_row)} == {len(node_features_col)}"
        node_features_graph = sp.csr_matrix((node_features_val, (node_features_row, node_features_col)))

        # -- edges
        # edges = np.array([[i, j] for i, j in zip(node_features_row, node_features_col)]).T  # dim (2, #edges)
        # -- for readability
        adj = node_features_graph.todense()

        # -- save to file_path
        np.save(file_path, adj)

    # display2screen('line 117')
    max_labels = np.amax(labels)
    non_class_label = max_labels+1

    labels = labels.tolist() + [non_class_label for i in range(adj.shape[0] - labels.flatten().shape[0])]

    # -- convert to numpy array
    edges = np.array([[i, j] for i, j in zip(node_features_row, node_features_col)]).T  # dim (2, #edges)
    edges = edges.astype(int)

    labels = np.array(labels)

    # display2screen(edges.shape)
    return adj, edges, labels
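
A minimal, hypothetical call to create_features_graph with a toy 3-node graph and 2 binary features is sketched below; the toy inputs are placeholders. Note that the function also depends on a preprocessing.normalize_features helper and caches its result under data/preprocessing/, so that directory must be writable.

import numpy as np
import scipy.sparse as sp

# toy inputs (hypothetical): 3 nodes, 2 binary features per node
adj = sp.coo_matrix(np.array([[0, 1, 0],
                              [1, 0, 1],
                              [0, 1, 0]], dtype=float))
features = sp.coo_matrix(np.array([[1, 0],
                                   [0, 1],
                                   [1, 1]], dtype=float))
labels = np.array([0, 1, 0])

adj_fg, edges, labels_fg = create_features_graph(adj, features, labels)
print(adj_fg.shape)     # adjacency of the combined node + feature-node graph
print(edges.shape)      # (2, num_edges)
print(labels_fg.shape)  # original labels plus an extra "non-class" label for feature nodes
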
Code Example #5
    # #   >> interesting observation:
    # #       : accuracy converges very slowly, reaching 71 percent at around epoch 550, with option 1.1 normalized adj
    # #       : accuracy converges very quickly at 61-61 percent with option 1.2 unnormalized adj
    # # tmp = adj
    # adj = normalize_features(sp.csr_matrix(adj.astype(float))) # --option 1.1 normalize
    # # adj = adj  # option 1.2 unnormalize
    # # display2screen(tmp[np.nonzero(x)[0][0], np.nonzero(x)[1][0]], x[np.nonzero(x)[0][0], np.nonzero(x)[1][0]])
    # adj = torch.tensor(adj)
    # x = adj
    # edge_index = torch.tensor(edge_index, dtype=torch.int32)
    # y = torch.tensor(y, dtype=torch.long)

    # -- option 2 => dim = (n+f) * (n+f); identity matrix
    #   >> test accuracy converges at around 61-62 percent
    x = np.identity(adj.shape[0])
    x = preprocessing.normalize_features(sp.csr_matrix(x))
    x = torch.tensor(x, dtype=torch.long)
    edge_index = torch.tensor(edge_index, dtype=torch.int32)
    y = torch.tensor(y, dtype=torch.long)


    # -- option 3 => dim = (n+f) * f;
    #       >> very bad accuracy at epoch 200, but it seems that its performance has not yet fully converged
    # features = np.concatenate((features.numpy(), np.identity(features.shape[1]))) # -- option 3.1
    # # features = np.concatenate((features.numpy(),  0 * np.identity(features.shape[1]))).astype(float) # -- option 3.2
    # # display2screen(np.amax(np.sum(features, axis=1)))
    # # tmp = features
    # features = normalize_features(sp.csr_matrix(features))
    # # display2screen(tmp[np.nonzero(x)[0][0], np.nonzero(x)[1][0]], x[np.nonzero(x)[0][0], np.nonzero(x)[1][0]])
    # features = torch.tensor(features, dtype=torch.long)
    # x = features
Code Example #6
def run_gcn_on_disease_graph(config, emb_name):
    '''
    Frame the problem by connecting subgraphs that share nodes,
        i.e. diseases that share a node will be connected by an edge.
    :param config:
    :return:
    '''
    # -- input arguments
    copd = config["data"]
    input = config["input"] # {disease_idx1: [[0,0,0,1,0,0], ....], disease_idx2: [...],... }
    y = config['label']
    train_mask = config['train_mask']
    test_mask = config['test_mask']
    emb = config['emb']
    hidden_sizes = config['hidden_layers']
    epochs = config['epochs']
    args = config['args']
    param = config['param']

    len_nodes = len(input.keys()) # total number of nodes
    train_label = y[train_mask]
    test_label = y[test_mask]
    train_onehot = []
    test_onehot = []
    train_key = []
    test_key = []

    # -- convert onehot input into the following format
    # from
    #   {disease_idx1: [[0,0,0,1,0,0],[0,1,0,0,0,0] ....], disease_idx2: [...],... }
    # to
    #   {disease_idx1: [0,1,0,1,0,0], disease_idx2: [...],... }
    for key, val in input.items():
        sum = 0
        if int(key) in train_mask:
            for v in val:
                sum = np.add(sum, v)
            input[key] = sum
            train_onehot.append(input[key])
            train_key.append(key)
        sum1 = 0
        if int(key) in test_mask:
            for v in val:
                sum1 = np.add(sum1, v)
            input[key] = sum1
            test_onehot.append(input[key])
            test_key.append(key)

    # -- normalize feature
    train_input = preprocessing.normalize_features(csr_matrix(np.array(train_onehot)))
    test_input = preprocessing.normalize_features(csr_matrix(np.array(test_onehot)))

    # -- edge_index for disease_graph
    #   1. find overlap value between each disease
    edge_index = []

    # The higher the threshold, the more the model overfits to the training set,
    # because edges will only form between nodes that have edges to many genes.
    th = int(args.th) # default = 100
    for d_out, k_out in zip(test_input, test_key):
        for d_in, k_in in zip(test_input, test_key):
            x = d_out - d_in
            x = x[x!=0]
            if x.shape[1] > th:
                if [k_out, k_in] not in edge_index and [k_in, k_out] not in edge_index:
                    # print(f"form edges between {k_out} and {k_in}")
                    edge_index.append([k_out, k_in])

    for d_out, k_out in zip(train_input, train_key):
        for d_in, k_in in zip(train_input, train_key):
            x = d_out - d_in
            x = x[x!=0]
            if x.shape[1] > th:
                if [k_out, k_in] not in edge_index and [k_in, k_out] not in edge_index:
                    # print(f"form edges between {k_out} and {k_in}")
                    edge_index.append([k_out, k_in])

    import math
    sparsity =  len(edge_index)/ math.factorial(len_nodes)

    print(f"num_edges = {len(edge_index)}")
    print(f"edges sparsity = {sparsity}" )

    edge_index = np.array(edge_index).T
    # display2screen(edge_index.shape, np.amax(edge_index.flatten()))

    # -- create train_input
    if emb_name != 'no_feat':
        train_input = emb[train_mask]
        test_input = emb[test_mask]
    else:
        train_input = train_input
        test_input = test_input

    # -- convert to tensor
    train_input = torch.tensor(train_input, dtype=torch.float)
    test_input = torch.tensor(test_input, dtype=torch.float)
    train_label = torch.tensor(train_label, dtype=torch.long)
    test_label = torch.tensor(test_label, dtype=torch.long)
    edge_index = torch.tensor(edge_index, dtype=torch.long)
    weighted_class = torch.tensor(args.weighted_class, dtype=torch.float)

    x = torch.cat((train_input,test_input), 0)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # import torch_geometric
    from torch_geometric.nn import GCNConv, ChebConv, GATConv, SAGEConv


    class Net(torch.nn.Module):
        def __init__(self):
            super(Net, self).__init__()

            modules = {
                # "conv1": GCNConv(20, args.hidden, cached=True),
                "conv1": GCNConv(train_input.shape[1], args.hidden, cached=True),
                "conv2": GCNConv(args.hidden, len(copd.labels2idx().keys()), cached=True)
            }

            for name, module in modules.items():
                self.add_module(name, module)

        def forward(self, x, edge_index):

            x = F.relu(self.conv1(x, edge_index))
            x = F.dropout(x, p=args.dropout, training=self.training)
            x = self.conv2(x, edge_index)
            return F.log_softmax(x, dim=1)

    gcn = Net().to(device)
    optimizer = torch.optim.Adam(gcn.parameters(), lr=args.lr, weight_decay=args.weight_decay)

    def unlabeled_weight(epochs):
        # Pseudo-label weight schedule: ramp alpha from 0 up to af between epochs T1 and T2.
        alpha = 0.0
        if epochs > param['T1']:
            if epochs > param['T2']:
                alpha = param['af']
            else:
                alpha = (epochs - param['T1']) / (param['T2'] - param['T1']) * param['af']
        return alpha

    def train():
        gcn.train()
        optimizer.zero_grad()

        if args.pseudo_label_topk:

            labeled_loss = F.nll_loss(gcn(x, edge_index)[train_mask], train_label,
                                     weight=torch.tensor(list(map(int, args.weighted_class)), dtype=torch.float),
                                     reduction="mean")

            # -- label the top-k most confident nodes as pseudo-labels
            pseudo_label_pred = gcn(x, edge_index).max(1)[1]

            tmp = gcn(x, edge_index).max(1)[0].detach().flatten().tolist()  # max log-probability per node = confidence score
            tmp = [(l, i) for i, l in enumerate(tmp)]
            tmp = sorted(tmp, key=lambda x: x[0], reverse=True)  # rank nodes by predicted confidence value

            ranked_labels = [(l, i) for (l, i) in tmp]
            top_k_tuple = []

            for (l, i) in ranked_labels:
                if len(top_k_tuple) >= int(args.topk):
                    break

                top_k_tuple.append((i, l))  # get index of top_k to be masked during loss
            if len(top_k_tuple) >0:
                top_k = [t[0] for t in top_k_tuple]

                # -- add top_k to labeld_loss
                pseudo_label_loss = F.nll_loss(gcn(x, edge_index)[top_k], pseudo_label_pred[top_k], weight=weighted_class,
                                            reduction='mean')
            else:
                pseudo_label_loss = 0

            loss_output = labeled_loss + pseudo_label_loss
        else:
            loss_output = F.nll_loss(gcn(x, edge_index)[train_mask], train_label,
                                 weight=torch.tensor(list(map(int, args.weighted_class)), dtype=torch.float),
                                 reduction="mean")

        loss_output.backward()
        optimizer.step()
        return loss_output.data

    def test():
        gcn.eval()
        train_pred = gcn(x, edge_index)[train_mask].max(1)[1]
        train_acc = train_pred.eq(train_label).sum().item() / train_mask.shape[0]

        test_pred = gcn(x, edge_index)[test_mask].max(1)[1]
        test_acc = test_pred.eq(test_label).sum().item() / test_mask.shape[0]

        return [train_acc, test_acc]

    train_acc_hist = []
    test_acc_hist = []
    loss_hist = []
    log_list = []
    for epoch in range(epochs):
        loss_epoch = train()
        train_acc, test_acc = test()
        logging = 'Epoch: {:03d}, Train: {:.4f}, Test: {:.4f}'.format(epoch, train_acc, test_acc)
        if args.verbose:
            print(logging)
        log_list.append(logging)
        loss_hist.append(loss_epoch)
        train_acc_hist.append(train_acc)
        test_acc_hist.append(test_acc)

    split = args.split
    # -- create dir for hyperparameter config if not already exists
    weighted_class = ''.join(list(map(str, args.weighted_class)))

    HP = f'lr={args.lr}_d={args.dropout}_wd={args.weight_decay}'
    folder = f"log/gene_disease/{args.time_stamp}/gcn_on_disease_graph/split={split}/{HP}/"

    import os
    if not os.path.exists(folder):
        os.makedirs(folder)

    # if args.add_features:
    if args.emb_name != "no_feat":
        feat_stat = "YES"
    else:
        feat_stat = "NO"

    if args.pseudo_label_all:
        pseudo_label_stat = "ALL"
    elif args.pseudo_label_topk:
        pseudo_label_stat = "TOP_K"
    elif args.pseudo_label_topk_with_replacement:
        pseudo_label_stat = "TOP_K_WITH_REPLACEMENT"
    else:
        pseudo_label_stat = "NONE"

    T_param = ','.join([str(param['T1']), str(param['T2'])])
    # -- create the directory if it does not exist yet
    save_path = f'{folder}img/'
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    if args.plot_all is True:
        args.plot_loss = True
        args.plot_no_train = True
        args.plot_train = True

    if args.plot_loss:
        # ======================
        # == plot loss and acc values
        # ======================
        plt.figure(1)
        # -- plot loss hist
        plt.subplot(211)
        plt.plot(range(len(loss_hist)), loss_hist)
        plt.ylabel("loss values")
        plt.title("loss history")

        # -- plot acc hist
        plt.subplot(212)
        plt.plot(range(len(train_acc_hist)), train_acc_hist)
        plt.plot(range(len(test_acc_hist)), test_acc_hist)
        plt.ylabel("accuracy values")
        plt.title("accuracy history")
        print(
            "writing to  " + save_path + f"LOSS_ACC_feat={feat_stat}_gene_thresh_hold={th}_wc=[{weighted_class}]_T=[{T_param}].png")
        plt.savefig(
            save_path + f'ACC_feat={feat_stat}_gene_thresh_hold={th}_wc=[{weighted_class}]_T=[{T_param}].png')
        plt.show()

    # --train_mask f1,precision,recall
    train_pred = gcn(x, edge_index)[train_mask].max(1)[1]
    train_f1 = f1_score(train_label, train_pred, average='micro')
    train_precision = precision_score(train_label, train_pred, average='micro')
    train_recall = recall_score(train_label, train_pred, average='micro')

    # -- test_mask f1,precision,recall
    test_pred = gcn(x, edge_index)[test_mask].max(1)[1]
    test_f1 = f1_score(test_label, test_pred, average='micro')
    test_precision = precision_score(test_label, test_pred, average='micro')
    test_recall = recall_score(test_label, test_pred, average='micro')

    if args.log:
        save_path = f'{folder}ACC_feat={feat_stat}_pseudo_label={pseudo_label_stat}_gene_thresh_hold={th}_wc={weighted_class}.txt'
        print(f"writing to {save_path}...")
        with open(save_path, 'w') as f:
            txt = '\n'.join(log_list)
            f.write(txt)

    if args.log:
        cm_train = confusion_matrix(gcn(x, edge_index)[train_mask].max(1)[1], train_label)
        cm_test = confusion_matrix(gcn(x, edge_index)[test_mask].max(1)[1], test_label)

        # formatter = {'float_kind': lambda x: "%.2f" % x})
        cm_train = np.array2string(cm_train)
        cm_test = np.array2string(cm_test)

        save_path = f'{folder}CM_feat={feat_stat}_pseudo_label={pseudo_label_stat}_gene_thresh_hold={th}_wc={weighted_class}.txt'
        print(f"writing to {save_path}...")

        # txt = 'class int_rep is [' + ','.join(list(map(str, np.unique(data.y.numpy()).tolist()))) + ']'
        txt = 'class int_rep is [' + ','.join([str(i) for i in range(len(copd.labels2idx().values()))]) + ']'
        txt = txt + '\n\n' + "training cm" + '\n' + cm_train + '\n' \
              + f"training_accuracy ={log_list[-1].split(',')[1]}" + '\n' \
              + f"training_f1       ={train_f1}" + '\n' \
              + f"training_precision={train_precision}" + '\n' \
              + f"training_recall   ={train_recall}" + '\n'

        txt = txt + '\n\n' + "test cm" + '\n' + cm_test + '\n' \
              + f"test_accuracy ={log_list[-1].split(',')[2]}" + '\n' \
              + f"test_f1       ={test_f1}" + '\n' \
              + f"test_precision={test_precision}" + '\n' \
              + f"test_recall   ={test_recall}" + '\n'

        with open(save_path, 'w') as f:
            f.write(txt)
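
run_gcn_on_disease_graph expects its inputs bundled in a config dict; the keys below are the ones read at the top of the function. The skeleton is hypothetical: every bound name (copd, disease_onehots, y, train_mask, test_mask, emb, args) is a placeholder, and args must be an argparse-style namespace exposing the attributes used throughout the function (th, hidden, dropout, lr, weight_decay, weighted_class, topk, verbose, split, time_stamp, emb_name, plus the pseudo-label, plotting and logging flags).

# Hypothetical config skeleton for run_gcn_on_disease_graph; every concrete value is a placeholder.
config = {
    'data': copd,                  # dataset object exposing labels2idx()
    'input': disease_onehots,      # {disease_idx: [[0, 0, 1, ...], ...], ...}
    'label': y,                    # numpy array of class labels
    'train_mask': train_mask,      # numpy array of training disease indices
    'test_mask': test_mask,        # numpy array of test disease indices
    'emb': emb,                    # precomputed embeddings (ignored when emb_name == 'no_feat')
    'hidden_layers': [64],
    'epochs': 200,
    'args': args,                  # argparse-style namespace described above
    'param': {'T1': 100, 'T2': 150, 'af': 0.3},  # pseudo-label schedule parameters
}

run_gcn_on_disease_graph(config, emb_name=args.emb_name)
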