def main():
    data_utils = DataUtils()
    clf_utils = ClassifierUtils()
    decision_documents, decision_labels = data_utils.load_decision_data()
    disagreement_documents, disagreement_labels = data_utils.load_disagreement_data()
    clf_metadata = {
        'type': 'RF',
        'n_estimators': 500,
        'max_depth': 128,
        'n_jobs': 8
    }
    features_metadata = {
        'type': 'count',
        'use_sw': True,
        'use_length': False,
        'binary': False,
        'normalize': False,
        'append_binary': False,
        'sampling': None
    }
    metrics = clf_utils.cross_validate(disagreement_documents, disagreement_labels,
                                       clf_metadata, features_metadata, num_splits=5)
    embed()
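# A minimal sketch (not part of the original) of how a classifier could be built from a
# clf_metadata dict of this shape; ClassifierUtils' real factory is not shown above, so
# the function below is an illustrative assumption only.
from sklearn.ensemble import RandomForestClassifier

def build_classifier(clf_metadata):
    # map the metadata dict onto a scikit-learn estimator
    if clf_metadata['type'] == 'RF':
        return RandomForestClassifier(n_estimators=clf_metadata['n_estimators'],
                                      max_depth=clf_metadata['max_depth'],
                                      n_jobs=clf_metadata['n_jobs'])
    raise ValueError('unsupported classifier type: %s' % clf_metadata['type'])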
def __init__(self, config=config_reader()):
    """Read model parameters from the configuration."""
    self.rnn_mode = config['rnn_mode']
    self.batch_size = config['batch_size']
    self.embedding_dim = config['embedding_dim']
    self.num_layers = config['num_layers']
    self.num_units = config['num_utils']  # note: the config key is spelled 'num_utils'
    self.FCNN_num_units = config['FCNN_num_units']
    self.learning_rate = config['learning_rate']
    self.max_epoch = config['max_epoch']
    self.keep_prob = config['keep_prob']
    self.model_path = config['model_path']
    self.logs_file = config['logs_file']
    self.end_loss = config['end_loss']
    self.save_model_name = config['save_model_name']
    self.print_step = config['print_step']
    self.save_epoch = config['save_epoch']
    self.data_utils = DataUtils()
    self.vocab = self.data_utils.vocab
    self.chunk_size = self.data_utils.chunk_size
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.increment_global_step_op = tf.assign(self.global_step, self.global_step + 1)
def load_data(self):
    self.du = DataUtils(self.config.training_file, self.config.testing_file,
                        self.config.batch_size)
    self.X_train = self.du.train_images
    self.y_train = self.du.train_labels
    self.X_val = self.du.val_images
    self.y_val = self.du.val_labels
    self.X_test = self.du.test_images
    self.y_test = self.du.test_labels
def main(path, graphics):
    t = DataUtils(path)
    train, test = t.train, t.test
    for _t in train + test:
        inp, out = _t['input'], _t['output']
        inp, out = np.asarray(inp), np.asarray(out)
        output_array = solve(inp, out, graphics)
        print(output_array)
def __init__(self):
    self.num_classes = 2
    self.resnet50_weights = os.path.realpath(
        'models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
    self.xception_weights = os.path.realpath(
        'models/xception_weights_tf_dim_ordering_tf_kernels_notop.h5')
    self.model_output_path = os.path.realpath('data/model_output.h5')
    self.model_path = {'resnet50': os.path.realpath('data/model_resnet50.h5'),
                       'xception': os.path.realpath('data/model_xception.h5')}
    self.transfer_classifiers = {'resnet50': (ResNet50, self.resnet50_weights),
                                 'xception': (Xception, self.xception_weights)}
    self.du = DataUtils()
def test(self, classifier, model=None):
    du = DataUtils()
    X_test, y_test = du.data_preprocess('test')
    pred = self.predict(X_test, classifier, model)
    y_pred = np.zeros(len(pred), dtype=int)
    y_pred[pred[:, 1] > pred[:, 0]] = 1
    score = metrics.accuracy_score(y_test[:, 1], y_pred)
    logger_tc.info('test accuracy: %.3f' % score)
    # open in append mode so the dataset can be created if it does not exist yet
    with h5py.File(self.model_output_path, 'a') as model_output:
        if '%s_test_pred' % classifier not in model_output:
            model_output.create_dataset('%s_test_pred' % classifier, data=pred)
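# A minimal sketch (not part of the original) of reading the stored test predictions back
# out of the HDF5 file written by test(); the path and dataset name below mirror the
# conventions used above but are assumptions here.
import h5py

with h5py.File('data/model_output.h5', 'r') as model_output:
    if 'resnet50_test_pred' in model_output:
        resnet50_test_pred = model_output['resnet50_test_pred'][:]  # load as a NumPy array
        print(resnet50_test_pred.shape)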
def normal_experiment(args):
    test_size = 0.3
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos, test_size=test_size, random_state=7014)
    neg_train, neg_test = train_test_split(neg, test_size=test_size, random_state=7014)
    pos_train_, neg_train_ = get_dataset_with_noise(pos_train, neg_train,
                                                    noise_rate=args.noise_rate)
    if args.name == 'member':
        beam_step = 3
        N_beam = 3
    elif args.name == 'subtree':
        beam_step = 3
        N_beam = 15
    else:
        beam_step = 5
        N_beam = 10
    N_max = 50
    N = 1
    ilp_train = ILPProblem(pos_train_, neg_train_, bk, lang, name=args.name)
    ilp_train.print()
    CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m, infer_step=args.T)
    clauses_, Ws_list, loss_list_list = solver.train_N(
        N=N, gen_mode='beam', N_max=N_max, T_beam=beam_step, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)
    v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
    mse = compute_mse(pos_test, neg_test, v_list[0], facts)
    auc = compute_auc(pos_test, neg_test, v_list[0], facts)
    print('====== TEST SCORE =======')
    print('Mean-squared test error: ', mse)
    print('AUC: ', auc)
def __init__(self, reports_directory, src_server, src_index, src_type):
    self.data_loader_utils_dest = DataLoaderUtils(src_server, src_index, src_type)
    self.reports_directory = reports_directory
    self.src_server = src_server
    self.src_index = src_index
    self.src_type = src_type
    self.delete_tags = True
    self.delete_annotations = True
    self.data_utils = DataUtils()
def get_all_data(batch_size, sentence_len, word2idx, label2idx, fold_num):
    utils = DataUtils(batch_size=batch_size, sentence_len=sentence_len,
                      word2idx=word2idx, label2idx=label2idx)
    # development set
    develop_sentences, develop_labels = utils.get_train_data("./data/", mode='develop_')
    develop_idx_x_batches, develop_y_batches, develop_word_len_batches = \
        utils.encoder_data2idx_batch(develop_sentences, develop_labels)
    # test set
    test_sentences, test_labels = utils.get_train_data("./data/", mode='test_')
    test_idx_x_batches, test_y_batches, test_word_len_batches = \
        utils.encoder_data2idx_batch(test_sentences, test_labels)
    # training set
    train_sentences, train_labels = utils.get_train_data("./data/", mode='train_')
    # k-fold split of the training set
    k_fold_x_train, k_fold_y_train, k_fold_x_test, k_fold_y_test = DataUtils.k_fold(
        train_sentences, train_labels, fold_num)
    # the "k_" prefix marks data carved out of the training set
    k_train_idx_x_batches_list, k_train_y_batches_list, k_train_word_len_batches_list = [], [], []
    k_develop_idx_x_batches_list, k_develop_y_batches_list, k_develop_word_len_batches_list = [], [], []
    if fold_num != 1:
        for fold_idx in range(fold_num):
            k_train_idx_x_batches, k_train_y_batches, k_train_word_len_batches = \
                utils.encoder_data2idx_batch(k_fold_x_train[fold_idx], k_fold_y_train[fold_idx])
            k_train_idx_x_batches_list.append(k_train_idx_x_batches)
            k_train_y_batches_list.append(k_train_y_batches)
            k_train_word_len_batches_list.append(k_train_word_len_batches)
            k_develop_idx_x_batches, k_develop_y_batches, k_develop_word_len_batches = \
                utils.encoder_data2idx_batch(k_fold_x_test[fold_idx], k_fold_y_test[fold_idx])
            k_develop_idx_x_batches_list.append(k_develop_idx_x_batches)
            k_develop_y_batches_list.append(k_develop_y_batches)
            k_develop_word_len_batches_list.append(k_develop_word_len_batches)
    else:
        k_train_idx_x_batches, k_train_y_batches, k_train_word_len_batches = \
            utils.encoder_data2idx_batch(k_fold_x_train[0], k_fold_y_train[0])
        k_train_idx_x_batches_list.append(k_train_idx_x_batches)
        k_train_y_batches_list.append(k_train_y_batches)
        k_train_word_len_batches_list.append(k_train_word_len_batches)
    return k_train_idx_x_batches_list, k_train_y_batches_list, k_train_word_len_batches_list, \
        k_develop_idx_x_batches_list, k_develop_y_batches_list, k_develop_word_len_batches_list, \
        develop_idx_x_batches, develop_y_batches, develop_word_len_batches, \
        test_idx_x_batches, test_y_batches, test_word_len_batches
def export_doc_ids(server, src_index, src_type, query=None):
    print __name__, 'Fetching doc ids for', server, src_index, src_type
    if query is None:
        query = {"match_all": {}}
    data_utils = DataUtils()
    ids = data_utils.batch_fetch_ids_for_query(base_url=server, index=src_index,
                                               type=src_type, query=query)
    documents_ids = dict.fromkeys(ids, None)
    print __name__, 'Done, fetched', len(documents_ids), 'doc ids'
    return documents_ids
def __init__(self, reports_directory, src_server, src_index, src_type,
             dest_server, dest_index, dest_type):
    self.data_loader_utils_dest = DataLoaderUtils(dest_server, dest_index, dest_type)
    self.reports_directory = reports_directory
    self.src_server = src_server
    self.src_index = src_index
    self.src_type = src_type
    self.dest_server = dest_server
    self.dest_index = dest_index
    self.dest_type = dest_type
    self.copy_tags = True
    self.copy_annotations = True
    self.combine_tags = False  # combining not implemented, keep False
    self.combine_annotations = False  # combining not implemented, keep False
    self.data_utils = DataUtils()
def create_node(self, data_set, max_depth, feature_index_list, boost=False):
    """
    Recursively construct the decision tree depth-first, setting each node's
    split feature and branching on information gain / entropy.
    :param data_set: rows to split at this node
    :param max_depth: remaining depth budget
    :param feature_index_list: indices of features still available for splitting
    :param boost: whether to use the boosting-aware split criterion
    :return: the constructed TreeNode
    """
    is_termination_condition, class_label = self.test_termination_condition(
        data_set, max_depth, feature_index_list)
    tree_node = TreeNode()
    eval_util = EvalUtil()
    data_util = DataUtils()
    feature_split_index = eval_util.get_split_attribute_index(
        data_set, feature_index_list, boost)
    tree_node.set_split_feature_index(feature_split_index)
    split_feature_values = data_util.get_feature_discrete_values(
        data_set, feature_split_index)
    tree_node.set_class_label(class_label)
    revised_index_list = [x for x in feature_index_list if x != feature_split_index]
    if not is_termination_condition:
        for value in split_feature_values:
            data_subset = data_util.get_data_subset(data_set, feature_split_index, value)
            if len(data_subset) > 0:
                child_node = self.create_node(data_subset, max_depth - 1, revised_index_list)
                tree_node.append_child(value, child_node)
            else:
                tree_node.append_child(value, None)
    return tree_node
def load_data(self):
    self.du = DataUtils(self.config)
    self.embedding_size = self.du.embedding_size
    self.sentence_size = self.du.sentence_size
    self.memory_size = self.du.memory_size
    self.vocab_size = self.du.vocab_size
    self.vocab = self.du.vocab
    self.trainS = self.du.trainS
    self.trainQ = self.du.trainQ
    self.trainA = self.du.trainA
    self.valS = self.du.valS
    self.valQ = self.du.valQ
    self.valA = self.du.valA
    self.testS = self.du.testS
    self.testQ = self.du.testQ
    self.testA = self.du.testA
    self.train_labels = self.du.train_labels
    self.val_labels = self.du.val_labels
    self.test_labels = self.du.test_labels
    self.data_length = len(self.du.data)
def setup_and_verify(self):
    parser = argparse.ArgumentParser(
        description='trec 2019 deep learning track (document re-ranking task)')
    torch.set_printoptions(threshold=500)
    self.data_utils = DataUtils(self.printer)
    self.model_utils = NDRMUtils(self.printer)
    self.learner_utils = LearnerUtils(self.printer)
    self.sub_utils = [self.data_utils, self.model_utils, self.learner_utils]
    for sub_utils in self.sub_utils:
        sub_utils.parent = self
        sub_utils.parser_add_args(parser)
    self.args = parser.parse_args()
    for sub_utils in self.sub_utils:
        sub_utils.parser_validate_args(self.args)
    self.__print_versions()
    for sub_utils in self.sub_utils:
        sub_utils.setup_and_verify()
    self.__print_args()
def test_termination_condition(self, data_set, max_depth, feature_index_list):
    """
    Test whether tree expansion should stop at this node. Expansion terminates
    when the maximum depth has been reached, the feature index list is
    exhausted, or the node's class is already pure.
    :param data_set: rows at the current node
    :param max_depth: remaining depth budget
    :param feature_index_list: indices of features still available for splitting
    :return: (termination flag, class label)
    """
    termination = True
    d_util = DataUtils()
    class_label, is_pure_class = d_util.get_class_label(data_set)
    if max_depth > 0 and feature_index_list is not None \
            and len(feature_index_list) > 0 and not is_pure_class:
        termination = False
    return termination, class_label
# -*- coding: utf-8 -*-
import os
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
import joblib  # sklearn.externals.joblib was removed in recent scikit-learn releases
from data_utils import DataUtils

cur_dir = os.path.dirname(__file__)
d_utils = DataUtils(filepath=os.path.join(cur_dir, "data/pKaInWater.csv"))
X_data, y_data_acidic, y_data_basic = d_utils.get_classification_data(
    feature_type="morgan+macc")
y_data = np.array([y_data_acidic, y_data_basic]).T

# train/test split
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data,
                                                    test_size=0.2, random_state=seed)
print("\n ========================= \n")
print("X_train.shape:", X_train.shape, "X_test.shape", X_test.shape)
print("\n ========================= \n")


def model_evaluation(model, x_input, y_input):
    y_pred = model.predict(x_input)
    print(classification_report(y_true=y_input, y_pred=y_pred))
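# A plausible continuation of the script above (not in the original): wrap an XGBoost
# classifier in OneVsRestClassifier for the two binary targets (acidic / basic) and
# reuse model_evaluation. The hyperparameters and output filename are placeholders.
clf = OneVsRestClassifier(
    xgb.XGBClassifier(n_estimators=200, max_depth=6, learning_rate=0.1))
clf.fit(X_train, y_train)
model_evaluation(clf, X_test, y_test)
joblib.dump(clf, os.path.join(cur_dir, "model_ovr_xgb.joblib"))  # persist the fitted model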
def train():
    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print('alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, '
          'ws : %d, ns : %d, maxT : %d, minT : %d, max_iter : %d, d : %d'
          % (alpha, beta, gamma, lam, args.p, args.ws, args.ns, args.maxT,
             args.minT, args.max_iter, args.d))
    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)
    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(args.train_data)
    edge_dict_u = gul.edge_dict_u
    edge_list = gul.edge_list
    walk_generator(gul, args)
    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = \
        get_context_and_negative_samples(gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)
    last_loss, count, epsilon = 0, 0, 1e-3
    print("============== training ==============")
    for iter in range(0, args.max_iter):
        s1 = "\r[%s%s]%0.2f%%" % ("*" * iter, " " * (args.max_iter - iter),
                                  iter * 100.0 / (args.max_iter - 1))
        loss = 0
        visited_u = dict(zip(node_list_u.keys(), [0] * len(node_list_u.keys())))
        visited_v = dict(zip(node_list_v.keys(), [0] * len(node_list_v.keys())))
        random.shuffle(edge_list)
        for i in range(len(edge_list)):
            u, v, w = edge_list[i]

            length = len(context_dict_u[u])
            random.shuffle(context_dict_u[u])
            if visited_u.get(u) < length:
                index_list = list(range(visited_u.get(u), min(visited_u.get(u) + 1, length)))
                for index in index_list:
                    context_u = context_dict_u[u][index]
                    neg_u = neg_dict_u[u][index]
                    # center, context, negatives, node list, learning rate
                    for z in context_u:
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u, lam, alpha)
                        node_list_u[z]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_u[u] = index_list[-1] + 3

            length = len(context_dict_v[v])
            random.shuffle(context_dict_v[v])
            if visited_v.get(v) < length:
                index_list = list(range(visited_v.get(v), min(visited_v.get(v) + 1, length)))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    # center, context, negatives, node list, learning rate
                    for z in context_v:
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v, lam, beta)
                        node_list_v[z]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = index_list[-1] + 3

            update_u, update_v, tmp_loss = KL_divergence(edge_dict_u, u, v,
                                                         node_list_u, node_list_v, lam, gamma)
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v

        delta_loss = abs(loss - last_loss)
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write(s1)
        sys.stdout.flush()
def train_by_sampling(args):
    model_path = os.path.join('../', args.model_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print('alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, '
          'ws : %d, ns : %d, maxT : %d, minT : %d, max_iter : %d, d : %d'
          % (alpha, beta, gamma, lam, args.p, args.ws, args.ns, args.maxT,
             args.minT, args.max_iter, args.d))
    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)
    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(args.train_data)  # e.g. train_data='../data/wiki/rating_train.dat'
    edge_dict_u = gul.edge_dict_u  # node-edge relations as a dict
    edge_list = gul.edge_list  # node-edge relations as a list
    walk_generator(gul, args)  # generate random walks
    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = \
        get_context_and_negative_samples(gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)  # initialise node embeddings
    last_loss, count, epsilon = 0, 0, 1e-3
    print("============== training ==============")
    for iter in range(0, args.max_iter):
        s1 = "\r[%s%s]%0.2f%%" % ("*" * iter, " " * (args.max_iter - iter),
                                  iter * 100.0 / (args.max_iter - 1))
        loss = 0
        visited_u = dict(zip(node_list_u.keys(), [0] * len(node_list_u.keys())))  # u-side visit counters start at 0
        visited_v = dict(zip(node_list_v.keys(), [0] * len(node_list_v.keys())))  # v-side visit counters start at 0
        random.shuffle(edge_list)  # edge_list holds the node-edge information
        for i in range(len(edge_list)):
            u, v, w = edge_list[i]

            length = len(context_dict_u[u])  # number of surrounding neighbours
            random.shuffle(context_dict_u[u])
            if visited_u.get(u) < length:
                index_list = list(range(visited_u.get(u), min(visited_u.get(u) + 1, length)))
                for index in index_list:
                    context_u = context_dict_u[u][index]  # pick one neighbour context of u
                    # negative samples for u; sampling is already random, so shuffling the
                    # context is enough, and the negatives differ across epochs
                    neg_u = neg_dict_u[u][index]
                    # center, context, negatives, node list, learning rate
                    for z in context_u:  # run a skip-gram update for every neighbour node
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u, lam, alpha)
                        node_list_u[u]['embedding_vectors'] += tmp_z  # update the node embedding
                        loss += tmp_loss
                visited_u[u] = index_list[-1] + 3

            length = len(context_dict_v[v])
            random.shuffle(context_dict_v[v])
            if visited_v.get(v) < length:
                index_list = list(range(visited_v.get(v), min(visited_v.get(v) + 1, length)))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    # center, context, negatives, node list, learning rate
                    for z in context_v:
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v, lam, beta)
                        node_list_v[v]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = index_list[-1] + 3

            # edge_dict_u: edge connectivity information
            update_u, update_v, tmp_loss = KL_divergence(edge_dict_u, u, v,
                                                         node_list_u, node_list_v, lam, gamma)
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v

        # gradient ascent: a larger loss is better
        delta_loss = abs(loss - last_loss)
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write(s1)
        sys.stdout.flush()
    save_to_file(node_list_u, node_list_v, model_path, args)
    print("")
    if args.rec:
        print("============== testing ===============")
        f1, map, mrr, mndcg = top_N(test_user, test_item, test_rate,
                                    node_list_u, node_list_v, args.top_n)
        print('recommendation metrics: F1 : %0.4f, MAP : %0.4f, MRR : %0.4f, NDCG : %0.4f'
              % (round(f1, 4), round(map, 4), round(mrr, 4), round(mndcg, 4)))
    if args.lip:
        print("============== testing ===============")
        auc_roc, auc_pr = link_prediction(args)
        print('link prediction metrics: AUC_ROC : %0.4f, AUC_PR : %0.4f'
              % (round(auc_roc, 4), round(auc_pr, 4)))
        # if curLabel[index] in self.iterativelyTest(data, self.tree):
        #     counter += 1
        return counter / len(curLabel)


class Node(object):
    def __init__(self, isLeaf, attribute, threshold):
        self.isLeaf = isLeaf
        self.attribute = attribute
        self.threshold = threshold
        self.children = []


if __name__ == '__main__':
    from data_utils import DataUtils
    trainfile_X = 'train-images-idx3-ubyte'
    trainfile_y = 'train-labels-idx1-ubyte'
    testfile_X = 't10k-images-idx3-ubyte'
    testfile_y = 't10k-labels-idx1-ubyte'
    train_X = DataUtils(filename=trainfile_X).getImage()
    train_y = DataUtils(filename=trainfile_y).getLabel()
    test_X = DataUtils(testfile_X).getImage()
    test_y = DataUtils(testfile_y).getLabel()

    a = C45()
    a.fit(train_X, train_y)
    # a.visualize2(a.tree)
    print(a.test(test_X, test_y))
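# DataUtils.getImage / getLabel above parse the MNIST IDX files; their implementation is
# not shown in this file. A standalone sketch of the parsing they would need to perform
# (big-endian IDX header followed by a raw uint8 payload), for reference only:
import struct
import numpy as np

def read_idx_images(path):
    with open(path, 'rb') as f:
        _, num, rows, cols = struct.unpack('>IIII', f.read(16))  # magic, count, rows, cols
        pixels = np.frombuffer(f.read(), dtype=np.uint8)
    return pixels.reshape(num, rows * cols)

def read_idx_labels(path):
    with open(path, 'rb') as f:
        _, num = struct.unpack('>II', f.read(8))  # magic, count
        return np.frombuffer(f.read(), dtype=np.uint8)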
def step_experiment(args, max_n=5, test_size=0.3):
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos, test_size=test_size, random_state=seed)
    neg_train, neg_test = train_test_split(neg, test_size=test_size, random_state=seed)
    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)

    if args.name in ['member']:
        N_max_list = [3, 6, 9, 12]
    else:
        N_max_list = [10, 15, 20, 25, 30, 35, 40]
    if args.name in ['subtree']:
        N_beam = 15
        T_beam = 3
    else:
        N_beam = 10
        T_beam = 7

    AUCs = []
    AUC_stds = []
    MSEs = []
    MSE_stds = []
    N = 5  # how many times to perform weight learning
    naive_AUCs = []
    naive_AUC_stds = []
    naive_MSEs = []
    naive_MSE_stds = []

    for N_max in N_max_list:
        CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m, infer_step=args.T)
        # solver = ILPSolver(ilp_train, C_0=clauses, m=args.m, infer_step=args.T)
        clauses_, Ws_list, loss_list_list = solver.train_N(
            N=N, gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
            epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
        auc_list = np.array([compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        auc_mean = np.mean(auc_list)
        auc_std = np.std(auc_list)
        AUCs.append(auc_mean)
        AUC_stds.append(auc_std)
        mse_list = np.array([compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        mse_mean = np.mean(mse_list)
        mse_std = np.std(mse_list)
        MSEs.append(mse_mean)
        MSE_stds.append(mse_std)

        # NAIVE
        CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m, infer_step=args.T)
        clauses_, Ws_list, naive_loss_list_list = solver.train_N(
            N=N, gen_mode='naive', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
            epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
        auc_list = np.array([compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        auc_mean = np.mean(auc_list)
        auc_std = np.std(auc_list)
        naive_AUCs.append(auc_mean)
        naive_AUC_stds.append(auc_std)
        mse_list = np.array([compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        mse_mean = np.mean(mse_list)
        mse_std = np.std(mse_list)
        naive_MSEs.append(mse_mean)
        naive_MSE_stds.append(mse_std)

        for j in range(N):
            loss_path = 'imgs/step/loss/' + args.name + \
                '[N_max:' + str(N_max) + ']-' + str(j) + '.pdf'
            ys_list = [loss_list_list[j], naive_loss_list_list[j]]
            plot_loss_compare(loss_path, ys_list,
                              args.name + ':[N_max:' + str(N_max) + ']-' + str(j))

    path_auc = 'imgs/step/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/step/' + args.name + '_MSE.pdf'
    print(AUC_stds)
    print(MSE_stds)
    labels = ['proposed', 'naive']
    plot_line_graph_compare_err(path=path_auc, xs=N_max_list,
                                ys_list=[AUCs, naive_AUCs],
                                err_list=[AUC_stds, naive_AUC_stds],
                                xlabel='Number of clauses', ylabel='AUC',
                                title=args.name, labels=labels)
    plot_line_graph_compare_err(path=path_mse, xs=N_max_list,
                                ys_list=[MSEs, naive_MSEs],
                                err_list=[MSE_stds, naive_MSE_stds],
                                xlabel='Number of clauses',
                                ylabel='Mean-squared test error',
                                title=args.name, labels=labels)
def noise_experiment(args, test_size=0.3):
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos, test_size=test_size, random_state=seed)
    neg_train, neg_test = train_test_split(neg, test_size=test_size, random_state=seed)
    noise_rates = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50]
    baseline_auc = [1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
    datasets = get_datasets_with_noise(pos_train, neg_train, noise_rates)

    AUCs = []
    AUC_stds = []
    MSEs = []
    MSE_stds = []
    N = 5  # how many times to perform weight learning

    if args.name == 'member':
        T_beam = 3
        N_beam = 3
    elif args.name == 'subtree':
        T_beam = 3
        N_beam = 15
    else:
        T_beam = 5
        N_beam = 10
    N_max = 50

    for i, (pos_train, neg_train) in enumerate(datasets):
        ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)
        print('NOISE RATE: ', noise_rates[i])
        ilp_train.print()
        CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m, infer_step=args.T)
        clauses_, Ws_list, loss_list_list = solver.train_N(
            N=N, gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
            epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
        auc_list = np.array([compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        auc_mean = np.mean(auc_list)
        auc_std = np.std(auc_list)
        AUCs.append(auc_mean)
        AUC_stds.append(auc_std)
        mse_list = np.array([compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        mse_mean = np.mean(mse_list)
        mse_std = np.std(mse_list)
        MSEs.append(mse_mean)
        MSE_stds.append(mse_std)
        for j in range(N):
            loss_path = 'imgs/noise/loss/' + args.name + \
                '[noise:' + str(noise_rates[i]) + ']-' + str(j) + '.pdf'
            plot_loss(loss_path, loss_list_list[j],
                      args.name + ':[noise:' + str(noise_rates[i]) + ']-' + str(j))

    # plot AUC with the baseline
    path_auc = 'imgs/noise/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/noise/' + args.name + '_MSE.pdf'
    print(AUC_stds)
    print(MSE_stds)
    plot_line_graph_baseline_err(path=path_auc, xs=noise_rates, ys=AUCs, err=AUC_stds,
                                 xlabel='Proportion of mislabeled training data',
                                 ylabel='AUC', title=args.name, baseline=baseline_auc)
    # plot MSE with std
    plot_line_graph_err(path=path_mse, xs=noise_rates, ys=MSEs, err=MSE_stds,
                        xlabel='Proportion of mislabeled training data',
                        ylabel='Mean-squared test error', title=args.name)
def train(args):
    model_path = os.path.join('../', args.model_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print('alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, '
          'ws : %d, ns : %d, maxT : %d, minT : %d, max_iter : %d, d : %d'
          % (alpha, beta, gamma, lam, args.p, args.ws, args.ns, args.maxT,
             args.minT, args.max_iter, args.d))
    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)
    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(args.train_data)
    edge_dict_u = gul.edge_dict_u
    edge_list = gul.edge_list
    walk_generator(gul, args)
    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = \
        get_context_and_negative_samples(gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)
    last_loss, count, epsilon = 0, 0, 1e-3
    print("============== training ==============")
    for iter in range(0, args.max_iter):
        s1 = "\r[%s%s]%0.2f%%" % ("*" * iter, " " * (args.max_iter - iter),
                                  iter * 100.0 / (args.max_iter - 1))
        loss = 0
        num = 0
        visited_u = dict(zip(node_list_u.keys(), [0] * len(node_list_u.keys())))
        visited_v = dict(zip(node_list_v.keys(), [0] * len(node_list_v.keys())))
        random.shuffle(edge_list)
        for (u, v, w) in edge_list:
            if visited_u.get(u) == 0 or random.random() > 0.95:
                length = len(context_dict_u[u])
                index_list = random.sample(list(range(length)), min(length, 1))
                for index in index_list:
                    context_u = context_dict_u[u][index]
                    neg_u = neg_dict_u[u][index]
                    # center, context, negatives, node list, learning rate
                    for k, z in enumerate(context_u):
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u, lam, alpha)
                        node_list_u[z]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_u[u] = 1
            if visited_v.get(v) == 0 or random.random() > 0.95:
                length = len(context_dict_v[v])
                index_list = random.sample(list(range(length)), min(length, 1))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    # center, context, negatives, node list, learning rate
                    for k, z in enumerate(context_v):
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v, lam, beta)
                        node_list_v[z]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = 1
            update_u, update_v, tmp_loss = KL_divergence(edge_dict_u, u, v,
                                                         node_list_u, node_list_v, lam, gamma)
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v
            count = iter
            num += 1

        delta_loss = abs(loss - last_loss)
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write(s1)
        sys.stdout.flush()
    save_to_file(node_list_u, node_list_v, model_path, args)
    print("")
    if args.rec:
        print("============== testing ===============")
        f1, map, mrr, mndcg = top_N(test_user, test_item, test_rate,
                                    node_list_u, node_list_v, args.top_n)
        print('recommendation metrics: F1 : %0.4f, MAP : %0.4f, MRR : %0.4f, NDCG : %0.4f'
              % (round(f1, 4), round(map, 4), round(mrr, 4), round(mndcg, 4)))
    if args.lip:
        print("============== testing ===============")
        auc_roc, auc_pr = link_prediction(args)
        print('link prediction metrics: AUC_ROC : %0.4f, AUC_PR : %0.4f'
              % (round(auc_roc, 4), round(auc_pr, 4)))
program_start = time()
root_path = r"D:\FF120\workspace\Python\data\MINIST"  # path to the MNIST files
save_to_disk = False  # whether to render the images and save them to disk

if os.path.exists(root_path + "\\train_X.npy"):
    train_X = np.load(root_path + '\\train_X.npy')
    train_y = np.load(root_path + '\\train_y.npy')
    test_X = np.load(root_path + '\\test_X.npy')
    test_y = np.load(root_path + '\\test_y.npy')
else:
    trainfile_X = root_path + '\\train-images.idx3-ubyte'
    trainfile_y = root_path + '\\train-labels.idx1-ubyte'
    testfile_X = root_path + '\\t10k-images.idx3-ubyte'
    testfile_y = root_path + '\\t10k-labels.idx1-ubyte'
    train_X = DataUtils(filename=trainfile_X).getImage()
    train_y = DataUtils(filename=trainfile_y).getLabel()
    test_X = DataUtils(testfile_X).getImage()
    test_y = DataUtils(testfile_y).getLabel()
    np.save(root_path + "\\train_X.npy", train_X)
    np.save(root_path + "\\train_y.npy", train_y)
    np.save(root_path + "\\test_X.npy", test_X)
    np.save(root_path + "\\test_y.npy", test_y)

# the block below saves the samples as image files on disk
if save_to_disk:
    t0 = time.time()
    path_trainset = root_path + "\\imgs_train"
    path_testset = root_path + "\\imgs_test"
    if not os.path.exists(path_trainset):
def softor_experiment(args, max_n=5, test_size=0.3):
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos, test_size=test_size, random_state=7014)
    neg_train, neg_test = train_test_split(neg, test_size=test_size, random_state=7014)
    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)

    N_max = 50
    if args.name in ['member']:
        T_beam = 3
        N_beam = 3
        m = 3
    elif args.name in ['subtree']:
        T_beam = 3
        N_beam = 15
        m = 4
    else:
        T_beam = 5
        N_beam = 10
        m = 3

    CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m, infer_step=args.T)
    # solver = ILPSolver(ilp_train, C_0=clauses, m=args.m, infer_step=args.T, im_mode='softmax')
    clauses_, Ws_, loss_list, times = solver.train_time(
        gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)
    print('Ws: ')
    for W in Ws_:
        print(F.softmax(W, dim=0))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)
    ent = compute_ent(Ws_, gen_mode='softmax')
    print('ENT:', ent)
    print('AUC:', auc)
    df = {}
    df['AUC'] = auc
    df['N_params'] = solver.count_params()
    df['time'] = mean(times)
    df['std'] = stdev(times)
    df['MSE'] = mse
    df['ENT'] = compute_ent(Ws_, gen_mode='softmax')
    path = 'results/' + args.name + '_softor' + '.txt'
    save(path, df)

    # PAIR
    CG = ClauseGenerator(ilp_train, infer_step=args.T, max_depth=1, max_body_len=1)
    solver = ILPSolver(ilp_train, C_0=clauses, CG=CG, m=args.m,
                       infer_step=args.T, im_mode='pair')
    # solver = ILPSolver(ilp_train, C_0=clauses, m=2, infer_step=args.T, im_mode='pair')
    clauses_, Ws_, pair_loss_list, times = solver.train_time(
        gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam,
        epoch=args.epoch, lr=args.lr, wd=0.0)
    print('Ws: ')
    print(softmax2d(Ws_[0]))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)
    df = {}
    df['AUC_pair'] = auc
    df['N_params_pair'] = solver.count_params()
    df['time_pair'] = mean(times)
    df['std_pair'] = stdev(times)
    df['MSE_pair'] = mse
    df['ENT_pair'] = compute_ent(Ws_, gen_mode='pair')
    path = 'results/' + args.name + '_pair' + '.txt'
    save(path, df)
    print(df)

    loss_path = 'imgs/softor/loss/' + args.name + '.pdf'
    ys_list = [loss_list, pair_loss_list]
    plot_loss_compare(loss_path, ys_list, args.name)
def __init__(self):
    self.data_utils = DataUtils()
    self.transfer_learner = TransferLearner()
    self.analyser = Analyser()
def run(epoches, batch_size, order, learnrate, learnrate2, decay, decay2):
    trainfile_X = 'train-images-idx3-ubyte'
    trainfile_y = 'train-labels-idx1-ubyte'
    testfile_X = 't10k-images-idx3-ubyte'
    testfile_y = 't10k-labels-idx1-ubyte'
    train_X = DataUtils(filename=trainfile_X).getImage()
    train_y = DataUtils(filename=trainfile_y).getLabel()
    test_X = DataUtils(testfile_X).getImage()
    test_y = DataUtils(testfile_y).getLabel()

    train_y = normalize(train_y)
    test_y = normalize(test_y)
    train_X = np.expand_dims(train_X, axis=-1)
    test_X = np.expand_dims(test_X, axis=-1)
    train_y = np.expand_dims(train_y, axis=-1)
    test_y = np.expand_dims(test_y, axis=-1)

    epoches = epoches
    batch_size = batch_size
    batches = int(len(train_X) / batch_size)
    order = order
    mu = 0.0
    sigma = 0.1
    image_d = 784
    neuron_d_1 = 256
    neuron_d_2 = 128
    neuron_d_output = 10

    dimension_1 = [neuron_d_1, image_d]
    weights_1 = create_weights(mu, sigma, dimension_1)
    bias_1 = np.zeros((neuron_d_1, 1))
    dimension_2 = [neuron_d_2, neuron_d_1]
    weights_2 = create_weights(mu, sigma, dimension_2)
    bias_2 = np.zeros((neuron_d_2, 1))
    dimension_3 = [neuron_d_output, neuron_d_2]
    weights_3 = create_weights(mu, sigma, dimension_3)
    bias_3 = np.zeros((neuron_d_output, 1))
    weights = [weights_1, weights_2, weights_3]
    bias = [bias_1, bias_2, bias_3]

    for epoch in range(epoches):
        learnrate -= learnrate * decay
        learnrate2 -= learnrate2 * decay2
        # train_X, train_y = shuffle(train_X, train_y, random_state=0)
        for i in range(batches):
            loss = 0
            accuracy = 0
            start = i * batch_size
            end = (i + 1) * batch_size
            x = np.concatenate(train_X[start:end], axis=-1)
            y = np.concatenate(train_y[start:end], axis=-1)
            layer_i, layer_o = forward(x, weights, bias)
            layer_b_w, layer_b = backward(x, y, layer_i, layer_o, order, weights, bias)
            if order == 1:
                weights_3 = np.add(weights_3, learnrate * layer_b_w[0])
                weights_2 = np.add(weights_2, learnrate * layer_b_w[1])
                weights_1 = np.add(weights_1, learnrate * layer_b_w[2])
                bias_3 = np.mean(np.add(bias_3, learnrate * layer_b[0]), axis=-1, keepdims=True)
                bias_2 = np.mean(np.add(bias_2, learnrate * layer_b[1]), axis=-1, keepdims=True)
                bias_1 = np.mean(np.add(bias_1, learnrate * layer_b[2]), axis=-1, keepdims=True)
            elif order == 2:
                weights_3 = np.add(np.add(weights_3, learnrate * layer_b_w[0][0]),
                                   -learnrate2 * layer_b_w[0][1])
                weights_2 = np.add(np.add(weights_2, learnrate * layer_b_w[1][0]),
                                   -learnrate2 * layer_b_w[1][1])
                weights_1 = np.add(np.add(weights_1, learnrate * layer_b_w[2][0]),
                                   -learnrate2 * layer_b_w[2][1])
                bias_3 = np.mean(np.add(np.add(bias_3, learnrate * layer_b[0][0]),
                                        -learnrate2 * layer_b[0][1]), axis=-1, keepdims=True)
                bias_2 = np.mean(np.add(np.add(bias_2, learnrate * layer_b[1][0]),
                                        -learnrate2 * layer_b[1][1]), axis=-1, keepdims=True)
                bias_1 = np.mean(np.add(np.add(bias_1, learnrate * layer_b[2][0]),
                                        -learnrate2 * layer_b[2][1]), axis=-1, keepdims=True)
            weights = [weights_1, weights_2, weights_3]
            bias = [bias_1, bias_2, bias_3]
            loss = sum(sum(abs(np.subtract(y, layer_o[2]))))
            for col in range(batch_size):
                accuracy += int(layer_o[2][:, col].argmax() == y[:, col].argmax())
            accuracy = accuracy / batch_size
            print('epoch:{}, batch:{} , loss:{}, accuracy:{}'.format(epoch, i, loss, accuracy))
            accuracy = 0

    accuracy = 0
    X = np.concatenate(test_X, axis=-1)
    Y = np.concatenate(test_y, axis=-1)
    _, output = forward(X, weights, bias)
    for i in range(len(Y[0])):
        accuracy += int(output[2][:, i].argmax() == Y[:, i].argmax())
    print('test accuracy:{}'.format(float(accuracy / len(Y[0]))))
    del weights
    del bias
    return accuracy
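# The helpers used by run() above (normalize, create_weights, forward, backward) are
# defined elsewhere in the project. A minimal sketch of two of them, assuming that
# normalize one-hot encodes integer labels and create_weights draws Gaussian-initialised
# weights; the real implementations may differ.
import numpy as np

def normalize(labels, num_classes=10):
    # map integer class labels to one-hot vectors, one row per sample
    one_hot = np.zeros((len(labels), num_classes))
    one_hot[np.arange(len(labels)), np.asarray(labels, dtype=int)] = 1.0
    return one_hot

def create_weights(mu, sigma, dimension):
    # Gaussian-initialised weight matrix of shape `dimension`
    return np.random.normal(mu, sigma, size=dimension)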
from data_utils import DataUtils

dataUtils = DataUtils()

if __name__ == '__main__':
    dataUtils.pre_process_aws(0.01)
    dataUtils.pre_process_aws(0.1)
    dataUtils.pre_process_aws(1)
    dataUtils.pre_process_col_tweets()
# coding=utf-8
import torch
import time

from cvae import ContinuousAgent, ContinuousVAE
from data_utils import DataUtils
from config import RunConfig as Config

if __name__ == "__main__":
    config = Config()
    api = DataUtils(config.ctx_encode_method)
    api.load_vocab()
    api.load_candidates()
    api.load_dialog(config.coming_task, config.system_mode)
    api.build_pad_config(config.memory_size)

    model = ContinuousVAE(config, api)
    if config.trained_model is not None:
        print("Using trained model in {}".format(config.trained_model))
        model.load_state_dict(torch.load(config.trained_model))

    agent = ContinuousAgent(config, model, api)
    t1 = time.time()
    agent.main()
    t2 = time.time()
    print("cost time: {} seconds.".format(t2 - t1))