def __init__(self, params, mode, device):
    assert mode in ['train', 'test', 'valid']
    np.random.seed(params['seed'])
    # self._const = 0  # constraint counter
    self._device = device
    self._n_data = params['n_data']
    self._benchmark = params['benchmark']
    self._batchsize = params['batch_size']
    self._violated_const_ratio = params[
        'violated_const_ratio'] if mode == 'train' else 0

    # Build an ad hoc dataset; the number of violated constraints can be tuned.
    (X, y) = util.build_dataset(self._benchmark, self._n_data,
                                self._violated_const_ratio, params['seed'])
    self._n_var = len(X[0])
    indices = self._get_indexes(params, self._n_data, mode, params['seed'])
    X, y = X[indices], y[indices]
    self._dataset = tuple([X, y])
def main():
    parser = get_parser()
    args = parser.parse_args()
    setup_seed(args.seed)
    device = 'cuda:' + str(args.device)

    train_loader, test_loader = build_dataset(args)
    train_accuracies = []
    test_accuracies = []

    class_num = 10 if args.dataset == 'cifar10' else 100

    # Teacher network: load a pretrained checkpoint.
    t_net = {
        'resnet18': resnet18,
        'resnet34': resnet34,
        'resnet56': resnet56,
        'resnet110': resnet110
    }[args.t_model](class_num)
    t_ckpt_name = 'SGD-CIFAR' + str(class_num) + '-' + args.t_model
    if args.dataset == 'cifar10':
        path = '../ckpt/checkpoint/cifar10/' + t_ckpt_name
    else:
        path = '../ckpt/checkpoint/cifar100/' + t_ckpt_name
    ckpt = torch.load(path, map_location=device)
    t_net.load_state_dict(ckpt['net'])
    t_net = t_net.to(device)

    # Student network: trained with distillation from the teacher.
    s_ckpt_name = 'SGD-CIFAR' + str(
        class_num) + '-' + args.s_model + '-student' + '-overhaul2'
    s_net = {
        'resnet18': resnet18,
        'resnet20': resnet20,
        'resnet34': resnet34,
        'resnet56': resnet56,
        'resnet110': resnet110
    }[args.s_model](class_num)
    s_net = s_net.to(device)

    optimizer = optim.SGD(s_net.parameters(),
                          args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss()
    d_net = distillation.Distiller(t_net, s_net)

    start_epoch = 0
    best_acc = 0
    start = time.time()
    for epoch in range(start_epoch, 150):
        # Decay the learning rate by 10x at epochs 80 and 120.
        if epoch in [80, 120]:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1

        train_acc = train_with_distill(d_net, optimizer, device, train_loader,
                                       criterion)
        test_acc = test(s_net, device, test_loader, criterion)
        end = time.time()
        print('epoch %d, train %.3f, test %.3f, time %.3fs' %
              (epoch, train_acc, test_acc, end - start))
        start = time.time()

        # Save checkpoint.
        if best_acc < test_acc:
            best_acc = test_acc
            if epoch > 80:
                state = {
                    'net': s_net.state_dict(),
                }
                if not os.path.isdir('../ckpt/checkpoint'):
                    os.mkdir('../ckpt/checkpoint')
                if args.dataset == 'cifar10':
                    if not os.path.isdir('../ckpt/checkpoint/cifar10'):
                        os.mkdir('../ckpt/checkpoint/cifar10')
                    torch.save(
                        state,
                        os.path.join('../ckpt/checkpoint/cifar10', s_ckpt_name))
                elif args.dataset == 'cifar100':
                    if not os.path.isdir('../ckpt/checkpoint/cifar100'):
                        os.mkdir('../ckpt/checkpoint/cifar100')
                    torch.save(
                        state,
                        os.path.join('../ckpt/checkpoint/cifar100', s_ckpt_name))
        print('best_acc %.3f' % best_acc)

        train_accuracies.append(train_acc)
        test_accuracies.append(test_acc)

        # Save accuracy curves.
        if not os.path.isdir('../ckpt/curve'):
            os.mkdir('../ckpt/curve')
        if args.dataset == 'cifar10':
            if not os.path.isdir('../ckpt/curve/cifar10'):
                os.mkdir('../ckpt/curve/cifar10')
            torch.save(
                {
                    'train_acc': train_accuracies,
                    'test_acc': test_accuracies
                }, os.path.join('../ckpt/curve/cifar10', s_ckpt_name))
        elif args.dataset == 'cifar100':
            if not os.path.isdir('../ckpt/curve/cifar100'):
                os.mkdir('../ckpt/curve/cifar100')
            torch.save(
                {
                    'train_acc': train_accuracies,
                    'test_acc': test_accuracies
                }, os.path.join('../ckpt/curve/cifar100', s_ckpt_name))
eta = 1
types = ["bernoulli", "multinomial"]
mode = "bin_class"
rseed = 20
num_samples = 10000
DEBUG = False
num_iter = 4

if __name__ == '__main__':
    nb_classifier = nb.NaiveBayesText(debug_mode=False,
                                      perc=2 / 3,
                                      rand_seed=rseed)
    dataset = build_dataset(dataset_path, family_labels_path, feature_list,
                            num_samples, eta, rseed, mode, True, False)
    print("**** Malware Detection ****")
    avg_perf = {
        t: {
            "accuracy": 0,
            "precision": 0,
            "recall": 0,
            "f1-score": 0
        }
        for t in types
    }
    for n in range(num_iter):
        print("TEST #{}".format(n + 1))
def sort():
    ### Part 3: sort files into associated Landsat-MODIS pairs ###
    dir, index = util.build_dataset(output_dir=os.environ['LS_MD_PAIRS'],
                                    stacked_bands=[1, 2, 3, 4, 5, 6, 7])
    return dir, index
if __name__ == "__main__": data_set = "data" embedding = "embedding_SougouNews.npz" model_name = args.model model = import_module("models." + model_name) # 导入model config = model.Config(data_set, embedding) np.random.seed(1) torch.manual_seed(1) torch.cuda.manual_seed_all(1) torch.backends.cudnn.deterministic = True # 保证每次结果一样 # 加载数据 start_time = time.time() print("start loading data...\n") vocab, train_data, dev_data, test_data = build_dataset( config, args.word) config.n_vocab = len(vocab) train_iter = build_iterator(train_data, config) dev_iter = build_iterator(dev_data, config) test_iter = build_iterator(test_data, config) time_dif = get_time_dif(start_time) print("\nload data time usage : ", time_dif) # 开始训练 start_time = time.time() model = model.Model(config).to(config.device) print("------\nmodel : {0}\n".format(model_name)) print(model.parameters) print("------\ntraining...\n") train(config, model, train_iter, dev_iter, test_iter)
def train(args):
    from_dtm = 2018100100
    to_dtm = 2019030100

    # Collect per-user article sequences.
    all_articles = []
    seen_seq = []
    for path, _ in tqdm(iterate_data_files(from_dtm, to_dtm), mininterval=1):
        for line in open(path):
            l = line.strip().split()
            user = l[0]
            seen = l[1:]
            if len(seen) > 1:
                seen_seq.append(seen)
            all_articles += seen

    vocabulary_size = len(set(all_articles))
    new_seen, count, article2idx_map, idx2article_map = \
        build_dataset(all_articles, seen_seq.copy(), vocabulary_size,
                      min_count=args.min_count, skip_window=args.skip_window)
    filtered_vocabulary_size = len(article2idx_map)
    print('Most common words', count[:5])
    print("# of sentences : all ({}) -> filtered ({})".format(
        len(seen_seq), len(new_seen)))
    print("# of vocabulary : all ({}) -> filtered ({})".format(
        vocabulary_size, filtered_vocabulary_size))

    # Reduce memory.
    del all_articles
    del seen_seq

    # Build skip-gram (target, context) pairs.
    span = 2 * args.skip_window + 1  # [ skip_window target skip_window ]
    buffer = deque(maxlen=span)  # pylint: disable=redefined-builtin
    skip_dummy = ['UNK'] * args.skip_window
    all_targets = []
    all_labels = []
    for sen_idx, sentence in tqdm(enumerate(new_seen), total=len(new_seen)):
        sentence = skip_dummy + sentence + skip_dummy
        buffer.extend(sentence[0:span - 1])
        for doc in sentence[span - 1:]:
            buffer.append(doc)
            if buffer[args.skip_window] != 'UNK':
                context_words = [
                    w for w in range(span)
                    if w != args.skip_window and buffer[w] != 'UNK'
                ]
                _num_sample = len(context_words) if len(
                    context_words) < args.num_skips else args.num_skips
                words_to_use = random.sample(context_words, _num_sample)
                for j, context_word in enumerate(words_to_use):
                    all_targets.append(
                        article2idx_map[buffer[args.skip_window]])
                    all_labels.append(article2idx_map[buffer[context_word]])

    t1 = time()
    print("Shuffling indexes...")
    idxes = [e for e in range(len(all_targets))]
    random.shuffle(idxes)
    all_targets = np.array(all_targets)[idxes]
    all_labels = np.array(all_labels)[idxes]
    del idxes
    t2 = time()
    print("Shuffling finished [{:.1f} s]".format(t2 - t1))

    config = {}
    config['batch_size'] = args.batch_size
    config['embedding_size'] = args.embedding_size
    config['skip_window'] = args.skip_window
    config['num_skips'] = args.num_skips
    config['num_sampled'] = args.num_sampled
    config['filtered_vocaulary_size'] = filtered_vocabulary_size

    sess = tf.Session()
    net = word2vec(sess, config)
    net.build_model()
    net.initialize_variables()

    # Linearly decay the learning rate from alpha to min_alpha.
    decay_alpha = (args.alpha - args.min_alpha) / args.num_steps
    alpha = args.alpha
    check_step = 10000
    save_step = 100000
    average_loss = 0
    t1 = time()
    for step in range(args.num_steps):
        batch_inputs, batch_labels = generate_batch(args.batch_size,
                                                    all_targets, all_labels)
        loss_val = net.train(batch_inputs, batch_labels, alpha=alpha)
        alpha -= decay_alpha
        average_loss += loss_val

        if step % check_step == 0 and step > 0:
            average_loss /= check_step
            t2 = time()
            print("Average loss at step {}: {:.5} [{:.1f} s]".format(
                step, average_loss, t2 - t1))
            t1 = t2
            average_loss = 0

        if (step % save_step == 0 and step > 0) or step + 1 == args.num_steps:
            print("Store checkpoints at step {}...".format(step))
            net.store_checkpoint(step=step)
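# The training loop above calls generate_batch(...), which is not defined in this
# snippet. The sketch below is a minimal, hypothetical version of such a helper,
# assuming it simply walks the pre-shuffled all_targets / all_labels arrays with a
# module-level cursor and wraps around at the end; it is not the original
# implementation.
import numpy as np

_batch_cursor = 0  # hypothetical module-level position in the shuffled arrays


def generate_batch(batch_size, all_targets, all_labels):
    """Return the next (inputs, labels) slice, wrapping around the arrays."""
    global _batch_cursor
    n = len(all_targets)
    idx = [(_batch_cursor + i) % n for i in range(batch_size)]
    _batch_cursor = (_batch_cursor + batch_size) % n
    batch_inputs = np.asarray(all_targets)[idx]
    # Sampled-softmax / NCE losses in TensorFlow expect labels of shape (batch_size, 1).
    batch_labels = np.asarray(all_labels)[idx].reshape(batch_size, 1)
    return batch_inputs, batch_labels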
parser = argparse.ArgumentParser()
add_arguments(parser)
args = parser.parse_args()
with open("args.pickle", "wb") as f:
    pickle.dump(args, f)

if not os.path.exists("saved_model"):
    os.mkdir("saved_model")

print("Building dictionary...")
word2index, index2word = build_word_dict("train", "data/train")
question_max_len, answer_max_len = 50, 50

print("Loading training dataset...")
qa_list = load_qa_list('data/train')
train_x, train_y = build_dataset("train", qa_list, word2index,
                                 question_max_len, answer_max_len)

with tf.Session() as sess:
    start = time.time()
    model = Model(index2word, question_max_len, answer_max_len, args)
    sess.run(tf.global_variables_initializer())
    saver = tf.train.Saver(tf.global_variables())

    batches = batch_iter(train_x, train_y, args.batch_size, args.num_epochs)
    num_batches_per_epoch = (len(train_x) - 1) // args.batch_size + 1

    print("\nIteration starts.")
    print("Number of batches per epoch :", num_batches_per_epoch)
    for batch_x, batch_y in batches:
        train_feed_dict = get_feed_dict(model, word2index, answer_max_len,
                                        batch_x, batch_y)
import tensorflow as tf
import pickle
from model import Model
from util import build_word_dict, build_dataset, batch_iter, load_qa_list, get_feed_dict

with open("args.pickle", "rb") as f:
    args = pickle.load(f)

word2index, index2word = build_word_dict("dev", "data/dev")
question_max_len, answer_max_len = 50, 50
qa_list = load_qa_list('data/dev')
dev_x, dev_y = build_dataset("dev", qa_list, word2index, question_max_len,
                             answer_max_len)

with tf.Session() as sess:
    print("Loading saved model...")
    model = Model(index2word,
                  question_max_len,
                  answer_max_len,
                  args,
                  forward_only=True)
    saver = tf.train.Saver(tf.global_variables())
    ckpt = tf.train.get_checkpoint_state("./saved_model/")
    saver.restore(sess, ckpt.model_checkpoint_path)

    batches = batch_iter(dev_x, dev_y, args.batch_size, 1)

    print("Writing Answers to 'result.txt'...")
    for batch_x, batch_y in batches:
        batch_x_len = list(
            map(lambda x: len([xx for xx in x if xx != 0]), batch_x))
def main(): print("Start inference!") test_end_date = "20190314" """ 아래 파라미터는 실험에 의해 적합한 값을 고름 test_days_len: 테스트 날짜로부터 이후 날짜 수 ex) 20 additional_days_len: 테스트 날짜로부터 이전 날짜 수 ex) 4 테스트 시작 날짜로 부터 <앞뒤> 기간에 쓰여진 문서를 candidate doc 으로 사용 """ test_days_len = 20 additional_days_len = 4 candidates_len = test_days_len + additional_days_len users_dict = {} with codecs.open('./res/users.json', 'rU', 'utf-8') as f: for line in f: j_map = json.loads(line) users_dict[j_map['id']] = j_map cand_docs = {} t_obj = datetime.strptime(test_end_date, "%Y%m%d") doc_deadline_date = (t_obj + timedelta(days=1)).strftime("%Y%m%d") candidate_date = (t_obj - timedelta(days=candidates_len)).strftime("%Y%m%d") doc_deadline_date = int(doc_deadline_date) * 100 candidate_date = int(candidate_date) * 100 with codecs.open('./res/metadata.json', 'rU', 'utf-8') as f: for line in f: j_map = json.loads(line) # ts 를 datetime 으로 변경 j_map['time'] = ts2time(j_map['reg_ts']) # [test 기간 + test 이전 몇 일 기간] 동안의 doc 정보 저장 if j_map['time'] < doc_deadline_date and j_map[ 'time'] > candidate_date: cand_docs[j_map['id']] = j_map print("# of candidate articles from {} to {} : {}".format( candidate_date // 100, test_end_date, len(cand_docs))) # 20190221 부터 한 달간의 클릭 문서 분포를 파악 d_obj = datetime.strptime("20190221", "%Y%m%d") date_list = [] for i in range(30): date_list.append((d_obj - timedelta(days=i)).strftime("%Y%m%d")) dist_map = get_click_dist(date_list, test_days_len, additional_days_len) s_obj = datetime.strptime("20190222", "%Y%m%d") dist_sorted_map = sorted(dist_map.items(), key=lambda k: -k[1]) click_rank_per_date = [((s_obj + timedelta(days=e[0])).strftime("%Y%m%d"), rank) for rank, e in enumerate(dist_sorted_map)] click_rank_per_date = dict(click_rank_per_date) print(click_rank_per_date) # 후보 doc 들을 writer 로 묶어줌 cand_doc_writer = {} for doc_id, doc_info in cand_docs.items(): writer = doc_info['user_id'] cand_doc_writer[writer] = cand_doc_writer.get(writer, []) + [doc_id] for k, v in cand_doc_writer.items(): c_v = [(e, int(e.split("_")[1])) for e in v] cand_doc_writer[k] = [(e[0], int(cand_docs[e[0]]['time'])) for e in sorted(c_v, key=lambda v: v[1])] user_seen = {} user_latest_seen = {} user_last_seen = {} # w2v 에 쓰일 sequences seen_seq = [] all_articles = [] # test 의 (겹치는)기간 동안의 doc 사용량 doc_cnt = {} from_dtm = 2018100100 to_dtm = 2019030100 for path, _ in tqdm(iterate_data_files(from_dtm, to_dtm), mininterval=1): for line in open(path): l = line.strip().split() user = l[0] seen = l[1:] if len(seen) > 1: seen_seq.append(seen) all_articles += seen user_seen[user] = user_seen.get(user, []) + seen date_range = path.split("./res/read/")[1] fr = int(date_range.split("_")[0]) if fr >= 2019020100: user_latest_seen[user] = user_latest_seen.get(user, []) + seen if fr < 2019022200: user_last_seen[user] = user_last_seen.get(user, []) + [fr] if fr >= 2019022200: for doc in seen: doc_cnt[doc] = doc_cnt.get(doc, 0) + 1 for u, dates in user_last_seen.items(): user_last_seen[u] = max(dates) doc_cnt = OrderedDict(sorted(doc_cnt.items(), key=lambda k: -k[1])) pop_list = [k for k, v in doc_cnt.items()][:300] del doc_cnt # word2vec 에 이용하는 데이터 만들기 vocabulary_size = len(set(all_articles)) _, _, article2idx_map, idx2article_map = \ build_dataset(all_articles, seen_seq.copy(), vocabulary_size, min_count=5, skip_window=4) filtered_vocabulary_size = len(article2idx_map) del all_articles del seen_seq print("# of vocabulary : all ({}) -> filtered ({})".format( vocabulary_size, filtered_vocabulary_size)) batch_size = 128 embedding_size = 128 num_sampled = 10 config = 
{} config['batch_size'] = batch_size config['embedding_size'] = embedding_size config['num_sampled'] = num_sampled config['filtered_vocaulary_size'] = filtered_vocabulary_size # word2vec ckpt 불러오기 sess = tf.Session() net = word2vec(sess, config) net.build_model() net.initialize_variables() net.restore_from_checkpoint(ckpt_path="./ckpt/", step=500000, use_latest=True) user_most_seen = {} for u, seen in user_latest_seen.items(): for doc in seen: if doc.startswith("@"): writer = doc.split("_")[0] seen_map = user_most_seen.get(u, {}) seen_map[writer] = seen_map.get(writer, 0) + 1 user_most_seen[u] = seen_map if u in user_most_seen: user_most_seen[u] = dict([ e for e in sorted(user_most_seen[u].items(), key=lambda k: -k[1]) ]) #tmp_dev = ['./tmp/dev.users.recommend', './tmp/dev.users'] #dev = ['./res/predict/dev.recommend.txt', './res/predict/dev.users'] test = ['./res/predict/recommend.txt', './res/predict/test.users'] path_list = [test] for output_path, user_path in path_list: print("Start recommendation!") print("Read data from {}".format(user_path)) print("Write data to {}".format(output_path)) ## word2vec 에 의한 top_n 먼저 계산 articles_len = 4 positives = [] with codecs.open(user_path, mode='r') as f: for idx, line in enumerate(f): u = line.rsplit()[0] pos = [ article2idx_map[e] for e in reversed(user_seen.get(u, [])) if e in article2idx_map ][:articles_len] remain_len = articles_len - len(pos) pos += [filtered_vocabulary_size for _ in range(remain_len)] positives.append(np.array(pos)) _, _, top_n_bests = net.most_similar(positives, idx2article_map=idx2article_map, top_n=300) top_n_bests = np.array(top_n_bests)[:, :, 0] with codecs.open(output_path, mode='w') as w_f: with codecs.open(user_path, mode='r') as f: for idx, line in tqdm(enumerate(f)): u = line.rsplit()[0] user_most_seen_map = user_most_seen.get(u, {}) def rerank_doc(doc_list): """ rerank : 세 가지 방식으로 doc_list 로 들어온 문서들을 재정렬함 - 우선순위 1. 유저가 과거(user_latest_seen) 에 본 에디터의 글 횟 수 -> 많을수록 우선 - 우선순위 2. 해당 날짜에 만들어진 문서가 클릭될 확률 순위(click_rank_per_date) -> rank 작을 수록 우선 - 우선순위 3. 문서가 만들어진 최신 순 """ n_doc_list = [] for e in doc_list: if e[1] > user_last_seen.get(u, 0) and str( e[1] // 100) in click_rank_per_date: writer = e[0].split("_")[0] writer_hit_cnt = user_most_seen_map.get( writer, 0) n_doc_list.append( (e[0], e[1], click_rank_per_date[str(e[1] // 100)], writer_hit_cnt)) reranked_doc_list = [ e[0] for e in sorted(n_doc_list, key=lambda k: (-k[3], k[2], k[1])) ] return reranked_doc_list ### 추천은 아래 1 + 2 + 3 순서로 함 # 1. 구독한 에디터들의 글 중들을 candidate 에서 뽑기 following_list = users_dict.get( u, {'following_list': []})['following_list'] following_doc = [] if following_list: for e in following_list: following_doc += cand_doc_writer.get(e, []) following_doc = rerank_doc(following_doc) # 2. 유저가 많이 본 에디터의 글들을 candidate 에서 뽑기 most_seen_new_doc = [] if user_most_seen_map: for e, writer_cnt in user_most_seen_map.items(): # writer 가 3 번 이상 본 경우에만 활용 if writer_cnt >= 3: most_seen_new_doc += cand_doc_writer.get(e, []) most_seen_new_doc = rerank_doc(most_seen_new_doc) # 3. 
word2vec 모델에서 가장 최근에 본 n 개 문서와 가장 유사한 문서들을 뽑기 positive_input = [ article2idx_map[e] for e in reversed(user_seen.get(u, [])) if e in article2idx_map ][:articles_len] if positive_input: sim_list = list(top_n_bests[idx]) else: sim_list = pop_list # 최종 추천 (1 + 2 + 3) rec_docs = following_doc + most_seen_new_doc + sim_list rec_docs = list(OrderedDict.fromkeys(rec_docs)) # 이미 유저가 과거에 본 문서는 제거 n_rec_docs = [] for d in rec_docs: if d not in user_seen.get(u, []): n_rec_docs.append(d) if len(n_rec_docs) < 100: n_rec_docs = pop_list line = "{} {}\n".format(u, ' '.join(n_rec_docs[:100])) w_f.write(line) print("Finish!")