def main():
    parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
    parser.add_argument(
        "-m", "--max",
        metavar="INTEGER",
        dest="max",
        type=int,
        default=None,
        help="maximum number of petitions to retrieve",
    )
    parser.add_argument(
        "-s", "--start",
        metavar="INTEGER",
        dest="start",
        type=int,
        default=1,
        help="starting page, 20 per page, default is 1",
    )
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error("How can I scrape less than one petition? You make no sense! --max must be one or greater.")
    if args.start < 1:
        parser.error("--start must be one or greater.")

    log("Found %i petitions" % (petitions(args.start, args.max)))

    # write log
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2),
          "log-wh-" + scrapelog["begin"] + ".json", log_dir())
def main():
    parser = argparse.ArgumentParser(description="Retrieve petitions from We The People")
    parser.add_argument("-m", "--max", metavar="INTEGER", dest="max", type=int,
                        default=None,
                        help="maximum pages of petitions to retrieve, default is 10, 100 per page")
    parser.add_argument("-s", "--start", metavar="INTEGER", dest="start", type=int,
                        default=1,
                        help="starting page, 100 per page, default is 1")
    parser.add_argument("-q", "--query", metavar="STRING", dest="query", type=str,
                        default="whitehouse+petition",
                        help="query for searching Twitter for petition links, default is 'whitehouse+petition'")
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error("How can I scrape less than one page of Twitter results? You make no sense! --max must be one or greater.")
    if args.start < 1:
        parser.error("--start must be one or greater.")
    if len(sys.argv) == 1:
        log("Running with default values. Use -h to see options.")

    search(args.query, args.start, args.max)

    # write log
    scrapelog["query"] = args.query
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2),
          "log-tw-" + scrapelog["begin"] + ".json", log_dir())
    log("Done. Found %i petitions in total" % (len(scrapelog["signatures"])))
def main():
    parser = argparse.ArgumentParser(
        description="Retrieve petitions from We The People")
    parser.add_argument("-m", "--max", metavar="MAX", dest="max", type=int,
                        default=None,
                        help="maximum number of petitions to retrieve")
    parser.add_argument("-s", "--start", metavar="START", dest="start", type=int,
                        default=1,
                        help="starting page, 20 per page, default is 1")
    args = parser.parse_args()

    if args.max is not None and args.max < 1:
        parser.error(
            "How can I scrape less than one petition? You make no sense! --max must be one or greater."
        )
    if args.start < 1:
        parser.error("--start must be one or greater.")

    log("Found %i petitions" % (petitions(args.start, args.max)))

    # write log
    scrapelog["end"] = datetime.now().strftime("%Y-%m-%d-%H:%M:%S")
    write(json.dumps(scrapelog, indent=2),
          "log-wh-" + scrapelog["begin"] + ".json", log_dir())
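# Hedged usage sketch for the main() entry points above, assuming each lives in its own
# script (the file names here are hypothetical) and that petitions()/search(), log(),
# write(), scrapelog, and log_dir() are defined elsewhere in the repository:
#
#   python petitions.py                 # scrape petitions starting from page 1
#   python petitions.py -s 3 -m 40      # start at page 3 (20 per page), stop after 40
#   python twitter.py -q "whitehouse+petition" -s 1 -m 10
#
# Values below 1 for --max or --start trigger parser.error(), which prints the message
# to stderr and exits with status 2 (argparse's default behavior).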
def main():
    # load the default parser values from the 'parser' module
    parser = get_parser()

    # get args from the parser as a namespace object
    args = parser.parse_args()
    args.device = 'cuda' if args.cuda else 'cpu'

    # initialize random seeds
    utils.init_seed(args.seed)

    loader = Loader.IncrementalLoader(args, seed=args.seed)
    n_inputs, n_outputs, n_tasks = loader.get_dataset_info()

    # set up logging (helpers live in utils/misc_utils.py); log_dir() also stores
    # the args in "training_parameters.json". The explicit timestamp may be
    # redundant because log_dir() already handles it.
    timestamp = utils.get_date_time()
    args.log_dir, args.tf_dir = utils.log_dir(args, timestamp)

    # create the neural-net model
    model = Model.Net(n_inputs, n_outputs, n_tasks, args,
                      innerlr=args.opt_lr, outerlr=args.alpha_init)
    # move the model to GPU if requested
    model.net.to(args.device)

    # run all the continual-learning baselines
    result_val_t, result_val_a, result_test_t, result_test_a, spent_time = \
        life_experience(model, loader, args)

    # save results to files or print them to the terminal
    save_results(args, result_val_t, result_val_a, result_test_t,
                 result_test_a, model, spent_time)
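# A minimal sketch of what utils.init_seed is assumed to do in main() above; this is an
# illustrative stand-in (init_seed_sketch is a hypothetical name), not the repository's
# actual implementation: seed every RNG the run touches so results are reproducible.
import random

import numpy as np
import torch


def init_seed_sketch(seed):
    """Hypothetical stand-in for utils.init_seed: seed Python, NumPy, and PyTorch RNGs."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)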
def train(train_data, test_data=None):
    features, label_map, \
        train_nodes, valid_nodes, test_nodes, \
        train_adj, train_weight_adj, train_column_adj, \
        test_adj, test_weight_adj, test_column_adj = train_data

    num_classes = label_map.shape[1]
    feats_dim = features.shape[1]

    if features is not None:
        # pad with a dummy zero vector (unclear whether this extra row is actually needed)
        features = np.vstack([features, np.zeros((feats_dim,))])

    # Wrapping the constant in a non-trainable Variable keeps the features on the graph
    features_info = tf.Variable(tf.constant(features, dtype=tf.float32), trainable=False)

    placeholders = construct_placeholders(num_classes, feats_dim)
    minibatch = NodeMinibatchIterator(
        placeholders,
        label_map,
        supervised_info=[train_nodes, valid_nodes, test_nodes],
        batch_size=FLAGS.batch_size,
        max_degree=FLAGS.max_degree)

    # Note: these are placeholders holding the *full* adjacency data
    adj_info_ph = tf.placeholder(tf.int32, shape=train_adj.shape)
    weight_adj_info_ph = tf.placeholder(tf.float32, shape=train_weight_adj.shape)
    column_adj_info_ph = tf.placeholder(tf.int32, shape=train_column_adj.shape)

    adj_info = tf.Variable(adj_info_ph, trainable=False, name="adj_info")
    weight_adj_info = tf.Variable(weight_adj_info_ph, trainable=False, name="weight_adj_info")
    column_adj_info = tf.Variable(column_adj_info_ph, trainable=False, name="column_adj_info")

    # These are assign ops only; nothing is assigned until they are run in a session
    train_adj_info = tf.assign(adj_info, train_adj)
    val_adj_info = tf.assign(adj_info, test_adj)
    train_weight_adj_info = tf.assign(weight_adj_info, train_weight_adj)
    val_weight_adj_info = tf.assign(weight_adj_info, test_weight_adj)
    train_column_adj_info = tf.assign(column_adj_info, train_column_adj)
    val_column_adj_info = tf.assign(column_adj_info, test_column_adj)

    # Neighbor sampler; the feature tensor is passed into the sampler here
    # TODO: the features should probably be factored out of the sampler
    sampler = UniformNeighborSampler(features_info, adj_info, weight_adj_info, column_adj_info)

    # === build model ===
    if FLAGS.model == 'graphsage_mean':
        sampler = UniformNeighborSampler(adj_info, weight_adj_info, column_adj_info)
        if FLAGS.samples_3 != 0:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2),
                SAGEInfo("node", sampler, FLAGS.samples_3, FLAGS.dim_2)
            ]
        elif FLAGS.samples_2 != 0:
            layer_infos = [
                SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
                SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)
            ]
        else:
            layer_infos = [SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1)]
        model = SupervisedGraphsage(num_classes, placeholders, features, adj_info,
                                    minibatch.deg, layer_infos,
                                    concat=True,
                                    model_size=FLAGS.model_size,
                                    sigmoid_loss=FLAGS.sigmoid,
                                    identity_dim=FLAGS.identity_dim,
                                    logging=True)
    elif FLAGS.model == 'gcn':
        sampler = UniformNeighborSampler(adj_info, weight_adj_info, column_adj_info)
        layer_infos = [
            SAGEInfo("node", sampler, FLAGS.samples_1, 2 * FLAGS.dim_1),
            SAGEInfo("node", sampler, FLAGS.samples_2, 2 * FLAGS.dim_2)
        ]
        model = SupervisedGraphsage(num_classes, placeholders, features, adj_info,
                                    minibatch.deg,
                                    layer_infos=layer_infos,
                                    aggregator_type="gcn",
                                    model_size=FLAGS.model_size,
                                    concat=False,
                                    sigmoid_loss=FLAGS.sigmoid,
                                    identity_dim=FLAGS.identity_dim,
                                    logging=True)
    elif FLAGS.model == 'geniepath':
        sampler = UniformNeighborSampler(adj_info, weight_adj_info, column_adj_info)
        layer_infos = [
            SAGEInfo("node", sampler, FLAGS.samples_1, FLAGS.dim_1),
            SAGEInfo("node", sampler, FLAGS.samples_2, FLAGS.dim_2)
        ]
        model = SupervisedGraphsage(num_classes, placeholders, features, adj_info,
                                    minibatch.deg,
                                    layer_infos=layer_infos,
                                    aggregator_type="geniepath",
                                    model_size=FLAGS.model_size,
                                    concat=False,
                                    sigmoid_loss=FLAGS.sigmoid,
                                    identity_dim=FLAGS.identity_dim,
                                    logging=True)
    elif FLAGS.model == 'cross':
        layer_infos = [
            SAGEInfo("node", FLAGS.samples_1, FLAGS.dim_1),
            SAGEInfo("node", FLAGS.samples_2, FLAGS.dim_2)
        ]
        model = SupervisedGraphsage(
            placeholders,
            feats_dim,
            num_classes,
            sampler,
            layer_infos=layer_infos,
            aggregator_type='cross',  # extra argument compared with the other branches
            concat=True,
            model_size=FLAGS.model_size,
            sigmoid_loss=FLAGS.sigmoid,
            identity_dim=FLAGS.identity_dim,
            logging=True)
    else:
        raise Exception('Error: model name unrecognized.')

    config = tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)
    config.gpu_options.allow_growth = True
    # config.gpu_options.per_process_gpu_memory_fraction = GPU_MEM_FRACTION
    config.allow_soft_placement = True

    # Initialize session
    sess = tf.Session(config=config)
    # merged = tf.summary.merge_all()
    # summary_writer = tf.summary.FileWriter(log_dir(), sess.graph)

    # Init variables
    sess.run(tf.global_variables_initializer(),
             feed_dict={
                 adj_info_ph: train_adj,
                 weight_adj_info_ph: train_weight_adj,
                 column_adj_info_ph: train_column_adj
             })

    # === Train model ===
    total_steps = 0
    avg_time = 0.0
    epoch_val_costs = []

    for epoch in range(FLAGS.train_epochs):
        minibatch.shuffle()
        iter = 0
        print('\n### Epoch: %04d ###' % (epoch + 1))
        epoch_val_costs.append(0)
        while not minibatch.end():
            # Construct feed dictionary; the full adjacency info is fed in at every step
            feed_dict, labels = minibatch.next_minibatch_feed_dict()
            feed_dict.update({placeholders['dropout']: FLAGS.dropout})

            outs = sess.run([model.opt_op, model.loss, model.preds], feed_dict=feed_dict)

            # (commented out in the original) per-iteration validation every
            # FLAGS.validate_iter steps, TensorBoard summary writing, and printing of
            # train/val loss, F1 scores, and timing via calc_f1/calc_acc/classification_report

            iter += 1
            total_steps += 1
            if total_steps > FLAGS.max_total_steps:
                break

        # At the end of each epoch, switch to the test adjacency and report test metrics
        if epoch % 1 == 0:
            sess.run([val_adj_info, val_weight_adj_info, val_column_adj_info])
            # (commented out in the original) full validation stats and AUC via
            # incremental_evaluate / my_incremental_evaluate

            test_cost, test_f1_mic, test_f1_mac, report, duration = incremental_evaluate(
                sess, model, minibatch, FLAGS.batch_size, test=True)
            area = my_incremental_evaluate(sess, model, minibatch, FLAGS.batch_size, test=True)
            print("Full Test stats:",
                  "loss=", "{:.5f}".format(test_cost),
                  "f1_micro=", "{:.5f}".format(test_f1_mic),
                  "f1_macro=", "{:.5f}".format(test_f1_mac),
                  "time=", "{:.5f}".format(duration))
            print(report)
            print('AUC', area)

            # once AUC > 0.83, save the model
            if area > 0.83:
                model.save(sess)
                print('AUC gotcha! model saved.')

        # TODO: early stopping should be added here
        if total_steps > FLAGS.max_total_steps:
            break

    # model.save(sess)
    print("Optimization Finished!")
    sess.run([val_adj_info.op, val_weight_adj_info.op, val_column_adj_info.op])
    # (commented out in the original) full validation stats, precision/recall curve, and
    # saving of val_preds/val_labels/val_cost arrays plus a val_stats.txt log file

    test_cost, test_f1_mic, test_f1_mac, report, duration = incremental_evaluate(
        sess, model, minibatch, FLAGS.batch_size, test=True)
    area = my_incremental_evaluate(sess, model, minibatch, FLAGS.batch_size, test=True)
    # (commented out in the original) precision/recall curve and saving of
    # test_preds/test_labels/test_cost arrays
    print("Full Test stats:",
          "loss=", "{:.5f}".format(test_cost),
          "f1_micro=", "{:.5f}".format(test_f1_mic),
          "f1_macro=", "{:.5f}".format(test_f1_mac),
          "time=", "{:.5f}".format(duration))
    print(report)
    print('AUC:', area)

    with open(log_dir() + "test_stats.txt", "w") as fp:
        fp.write("loss={:.5f} f1_micro={:.5f} f1_macro={:.5f}".format(
            test_cost, test_f1_mic, test_f1_mac))
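# The training loop above carries a TODO about adding early stopping. Below is a minimal
# sketch of a patience-based check that could be called once per epoch with a validation
# loss; the EarlyStopping class and its names are illustrative and not part of the
# original code.


class EarlyStopping:
    """Signal a stop when the monitored loss has not improved for `patience` epochs."""

    def __init__(self, patience=10, min_delta=0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best_loss = float('inf')
        self.bad_epochs = 0

    def step(self, val_loss):
        """Record one epoch's validation loss; return True when training should stop."""
        if val_loss < self.best_loss - self.min_delta:
            self.best_loss = val_loss
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        return self.bad_epochs >= self.patience


# Hypothetical wiring inside the epoch loop of train():
#   stopper = EarlyStopping(patience=10)
#   ...
#   if stopper.step(val_cost):
#       print("Early stopping triggered.")
#       break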
def fit(self, X, y, n_epochs=100, X_valid=None, y_valid=None):
    """Train the model on the training set.

    If X_valid and y_valid are given, early stopping is applied.
    """
    self.close_session()
    tf.summary.merge_all()

    # Infer n_inputs and n_outputs from the training set.
    n_inputs = X.shape[1]
    self.classes_ = np.unique(y)
    n_outputs = len(self.classes_)

    # Convert the label vector into a vector of sorted class indices,
    # containing integers from 0 to n_outputs - 1.
    # For example, if y is [8, 8, 9, 5, 7, 6, 6, 6], the sorted class
    # labels (self.classes_) become [5, 6, 7, 8, 9] and the label
    # vector is converted to [3, 3, 4, 0, 2, 1, 1, 1].
    self.class_to_index_ = {label: index
                            for index, label in enumerate(self.classes_)}
    y = np.array([self.class_to_index_[label] for label in y], dtype=np.int32)

    self._graph = tf.Graph()
    with self._graph.as_default():
        self._build_graph(n_inputs, n_outputs)
        # extra ops for batch normalization
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

    # needed for early stopping
    max_checks_without_progress = 20
    checks_without_progress = 0
    best_loss = np.infty
    best_params = None

    # training
    file_writer = tf.summary.FileWriter(log_dir("board_log"), graph=self._graph)
    self._session = tf.Session(graph=self._graph)
    with self._session.as_default() as sess:
        self._init.run()
        for epoch in range(n_epochs):
            rnd_idx = np.random.permutation(len(X))
            for rnd_indices in np.array_split(rnd_idx, len(X) // self.batch_size):
                X_batch, y_batch = X[rnd_indices], y[rnd_indices]
                feed_dict = {self._X: X_batch, self._y: y_batch}
                if self._training is not None:
                    feed_dict[self._training] = True
                sess.run(self._training_op, feed_dict=feed_dict)
                if extra_update_ops:
                    sess.run(extra_update_ops, feed_dict=feed_dict)
            if X_valid is not None and y_valid is not None:
                loss_val, acc_val, loss_str, acc_str = sess.run(
                    [self._loss, self._accuracy, self._loss_str, self._acc_str],
                    feed_dict={self._X: X_valid, self._y: y_valid})
                if loss_val < best_loss:
                    best_params = self._get_model_params()
                    best_loss = loss_val
                    checks_without_progress = 0
                else:
                    checks_without_progress += 1
                print("{}\tValidation loss: {:.6f}\tBest loss: {:.6f}\tAccuracy: {:.2f}%".format(
                    epoch, loss_val, best_loss, acc_val * 100))
                file_writer.add_summary(summary=acc_str, global_step=epoch)
                file_writer.add_summary(summary=loss_str, global_step=epoch)
                if checks_without_progress > max_checks_without_progress:
                    print("Early stopping!")
                    break
            else:
                loss_train, acc_train = sess.run(
                    [self._loss, self._accuracy],
                    feed_dict={self._X: X_batch, self._y: y_batch})
                print("{}\tLast training batch loss: {:.6f}\tAccuracy: {:.2f}%".format(
                    epoch, loss_train, acc_train * 100))
        # With early stopping, roll back to the best model found.
        if best_params:
            self._restore_model_params(best_params)
        return self
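# Hedged usage sketch: assuming fit() above belongs to a scikit-learn style wrapper class
# (called DNNClassifier here purely for illustration) that also defines close_session(),
# _build_graph(), _get_model_params(), and _restore_model_params(), training with early
# stopping on a held-out split would look like this:
#
#   from sklearn.model_selection import train_test_split
#
#   X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2,
#                                                          random_state=42)
#   clf = DNNClassifier(batch_size=64)          # hypothetical constructor arguments
#   clf.fit(X_train, y_train, n_epochs=100,
#           X_valid=X_valid, y_valid=y_valid)   # validation data enables early stopping
#
# Without validation data, fit() falls back to printing the loss and accuracy of the last
# training batch of each epoch and applies no early stopping.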