Example #1
def main():
    data_utils = DataUtils()
    clf_utils = ClassifierUtils()
    decision_documents, decision_labels = data_utils.load_decision_data()
    disagreement_documents, disagreement_labels = data_utils.load_disagreement_data()
    clf_metadata = {
        'type': 'RF',
        'n_estimators': 500,
        'max_depth': 128,
        'n_jobs': 8
    }
    features_metadata = {
        'type': 'count',
        'use_sw': True,
        'use_length': False,
        'binary': False,
        'normalize': False,
        'append_binary': False,
        'sampling': None
    }

    metrics = clf_utils.cross_validate(disagreement_documents,
                                       disagreement_labels,
                                       clf_metadata,
                                       features_metadata,
                                       num_splits=5)

    embed()  # drop into an interactive IPython shell (requires `from IPython import embed`)
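The project's ClassifierUtils is not shown here. As a purely hypothetical sketch, a clf_metadata of type 'RF' could be mapped onto scikit-learn directly; the helper name sketch_cross_validate and the CountVectorizer pipeline below are assumptions for illustration, not the project's API.

# Hypothetical sketch only, not the project's ClassifierUtils.cross_validate.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

def sketch_cross_validate(documents, labels, clf_metadata, num_splits=5):
    # Map the 'RF' metadata onto a RandomForestClassifier.
    clf = RandomForestClassifier(n_estimators=clf_metadata['n_estimators'],
                                 max_depth=clf_metadata['max_depth'],
                                 n_jobs=clf_metadata['n_jobs'])
    # The 'count' feature type roughly corresponds to a bag-of-words CountVectorizer.
    pipeline = make_pipeline(CountVectorizer(), clf)
    return cross_val_score(pipeline, documents, labels, cv=num_splits)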
Example #2
    def __init__(self, config=config_reader()):
        """
        read model param
        """
        self.rnn_mode = config['rnn_mode']
        self.batch_size = config['batch_size']
        self.embedding_dim = config['embedding_dim']
        self.num_layers = config['num_layers']
        self.num_units = config['num_utils']
        self.FCNN_num_units = config['FCNN_num_units']
        self.learning_rate = config['learning_rate']
        self.max_epoch = config['max_epoch']
        self.keep_prob = config['keep_prob']
        self.model_path = config['model_path']
        self.logs_file = config['logs_file']
        self.end_loss = config['end_loss']
        self.save_model_name = config['save_model_name']
        self.print_step = config['print_step']
        self.save_epoch = config['save_epoch']

        self.data_utils = DataUtils()
        self.vocab = self.data_utils.vocab
        self.chunk_size = self.data_utils.chunk_size
        self.global_step = tf.Variable(0, trainable=False, name='global_step')
        self.increment_global_step_op = tf.assign(self.global_step,
                                                  self.global_step + 1)
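The config_reader() helper is not shown, but the constructor above implies a mapping with at least the keys read there. A minimal hypothetical dictionary follows; only the key names come from the snippet, every value is a placeholder, not the project's defaults.

# Placeholder values; only the key names are taken from the constructor above.
config = {
    'rnn_mode': 'lstm', 'batch_size': 64, 'embedding_dim': 128,
    'num_layers': 2, 'num_utils': 256, 'FCNN_num_units': 128,
    'learning_rate': 1e-3, 'max_epoch': 20, 'keep_prob': 0.8,
    'model_path': './model', 'logs_file': './logs.txt', 'end_loss': 0.01,
    'save_model_name': 'model.ckpt', 'print_step': 100, 'save_epoch': 1,
}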
Example #3
 def load_data(self):
     self.du = DataUtils(self.config.training_file,
                         self.config.testing_file, self.config.batch_size)
     self.X_train = self.du.train_images
     self.y_train = self.du.train_labels
     self.X_val = self.du.val_images
     self.y_val = self.du.val_labels
     self.X_test = self.du.test_images
     self.y_test = self.du.test_labels
Example #4
def main(path, graphics):

    t = DataUtils(path)
    train, test = t.train, t.test
    for _t in train + test:
        inp, out = _t['input'], _t['output']
        inp, out = np.asarray(inp), np.asarray(out)
        output_array = solve(inp, out, graphics)
        print(output_array)
Example #5
	def __init__(self):
		self.num_classes = 2
		self.resnet50_weights = os.path.realpath('models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5')
		self.xception_weights = os.path.realpath('models/xception_weights_tf_dim_ordering_tf_kernels_notop.h5')
		self.model_output_path = os.path.realpath('data/model_output.h5')
		self.model_path = {'resnet50': os.path.realpath('data/model_resnet50.h5'),
						   'xception': os.path.realpath('data/model_xception.h5')}
		self.transfer_classifiers = {'resnet50': (ResNet50, self.resnet50_weights),
		                             'xception': (Xception, self.xception_weights)}
		self.du = DataUtils()
Example #6
	def test(self, classifier, model=None):
		du = DataUtils()
		X_test, y_test = du.data_preprocess('test')
		pred = self.predict(X_test, classifier, model)
		y_pred = np.zeros(len(pred), dtype=int)
		y_pred[pred[:, 1] > pred[:, 0]] = 1
		score = metrics.accuracy_score(y_test[:, 1], y_pred)
		logger_tc.info('test accuracy: %.3f' % score)
		with h5py.File(self.model_output_path, 'a') as model_output:  # explicit mode; h5py >= 3.0 no longer defaults to append
			if '%s_test_pred' % classifier not in model_output:
				model_output.create_dataset('%s_test_pred' % classifier, data=pred)
Example #7
def normal_experiment(args):
    test_size = 0.3
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos,
                                           test_size=test_size,
                                           random_state=7014)
    neg_train, neg_test = train_test_split(neg,
                                           test_size=test_size,
                                           random_state=7014)

    pos_train_, neg_train_ = get_dataset_with_noise(pos_train,
                                                    neg_train,
                                                    noise_rate=args.noise_rate)

    if args.name == 'member':
        beam_step = 3
        N_beam = 3
    elif args.name == 'subtree':
        beam_step = 3
        N_beam = 15
    else:
        beam_step = 5
        N_beam = 10
    N_max = 50
    N = 1

    ilp_train = ILPProblem(pos_train_, neg_train_, bk, lang, name=args.name)
    ilp_train.print()
    CG = ClauseGenerator(ilp_train,
                         infer_step=args.T,
                         max_depth=1,
                         max_body_len=1)
    solver = ILPSolver(ilp_train,
                       C_0=clauses,
                       CG=CG,
                       m=args.m,
                       infer_step=args.T)
    clauses_, Ws_list, loss_list_list = solver.train_N(N=N,
                                                       gen_mode='beam',
                                                       N_max=N_max,
                                                       T_beam=beam_step,
                                                       N_beam=N_beam,
                                                       epoch=args.epoch,
                                                       lr=args.lr,
                                                       wd=0.0)
    v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)
    mse = compute_mse(pos_test, neg_test, v_list[0], facts)
    auc = compute_auc(pos_test, neg_test, v_list[0], facts)

    print('====== TEST SCORE =======')
    print('Mean-squared test error: ', mse)
    print('AUC: ', auc)
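get_dataset_with_noise is defined elsewhere in the project; only its name and signature appear above. A minimal sketch of one plausible implementation, assuming "noise" means swapping a fraction of examples between the positive and negative sets:

import random

def get_dataset_with_noise(pos, neg, noise_rate=0.1):
    # Assumed behaviour: move a noise_rate fraction of examples across classes.
    pos, neg = list(pos), list(neg)
    n_pos, n_neg = int(len(pos) * noise_rate), int(len(neg) * noise_rate)
    flipped_pos = random.sample(range(len(pos)), n_pos)
    flipped_neg = random.sample(range(len(neg)), n_neg)
    new_pos = [p for i, p in enumerate(pos) if i not in flipped_pos] + \
              [neg[i] for i in flipped_neg]
    new_neg = [n for i, n in enumerate(neg) if i not in flipped_neg] + \
              [pos[i] for i in flipped_pos]
    return new_pos, new_neg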
Example #8
    def __init__(self, reports_directory, src_server, src_index, src_type):
        self.data_loader_utils_dest = DataLoaderUtils(src_server, src_index, src_type)
        self.reports_directory = reports_directory

        self.src_server = src_server
        self.src_index = src_index
        self.src_type = src_type

        self.delete_tags = True
        self.delete_annotations = True

        self.data_utils = DataUtils()
Example #9
    def get_all_data(batch_size, sentence_len, word2idx, label2idx, fold_num):
        utils = DataUtils(batch_size=batch_size,
                          sentence_len=sentence_len,
                          word2idx=word2idx,
                          label2idx=label2idx)

        # development set
        develop_sentences, develop_labels = utils.get_train_data(
            "./data/", mode='develop_')
        develop_idx_x_batches, develop_y_batches, develop_word_len_batches = utils.encoder_data2idx_batch(
            develop_sentences, develop_labels)

        # test set
        test_sentences, test_labels = utils.get_train_data("./data/",
                                                           mode='test_')
        test_idx_x_batches, test_y_batches, test_word_len_batches = utils.encoder_data2idx_batch(
            test_sentences, test_labels)
        # training set
        train_sentences, train_labels = utils.get_train_data("./data/",
                                                             mode='train_')
        # 5-fold split of the training set
        k_fold_x_train, k_fold_y_train, k_fold_x_test, k_fold_y_test = DataUtils.k_fold(
            train_sentences, train_labels, fold_num)
        # the k_ prefix denotes data carved out of the training set
        k_train_idx_x_batches_list, k_train_y_batches_list, k_train_word_len_batches_list = [], [], []
        k_develop_idx_x_batches_list, k_develop_y_batches_list, k_develop_word_len_batches_list = [], [], []

        if fold_num != 1:
            for fold_idx in range(fold_num):
                k_train_idx_x_batches, k_train_y_batches, k_train_word_len_batches = utils.encoder_data2idx_batch(
                    k_fold_x_train[fold_idx], k_fold_y_train[fold_idx])
                k_train_idx_x_batches_list.append(k_train_idx_x_batches)
                k_train_y_batches_list.append(k_train_y_batches)
                k_train_word_len_batches_list.append(k_train_word_len_batches)

                k_develop_idx_x_batches, k_develop_y_batches, k_develop_word_len_batches = utils.encoder_data2idx_batch(
                    k_fold_x_test[fold_idx], k_fold_y_test[fold_idx])
                k_develop_idx_x_batches_list.append(k_develop_idx_x_batches)
                k_develop_y_batches_list.append(k_develop_y_batches)
                k_develop_word_len_batches_list.append(
                    k_develop_word_len_batches)
        else:
            k_train_idx_x_batches, k_train_y_batches, k_train_word_len_batches = utils.encoder_data2idx_batch(
                k_fold_x_train[0], k_fold_y_train[0])
            k_train_idx_x_batches_list.append(k_train_idx_x_batches)
            k_train_y_batches_list.append(k_train_y_batches)
            k_train_word_len_batches_list.append(k_train_word_len_batches)
        return k_train_idx_x_batches_list, k_train_y_batches_list, k_train_word_len_batches_list, \
               k_develop_idx_x_batches_list, k_develop_y_batches_list, k_develop_word_len_batches_list, \
               develop_idx_x_batches, develop_y_batches, develop_word_len_batches, \
               test_idx_x_batches, test_y_batches, test_word_len_batches,
Example #10
def export_doc_ids(server, src_index, src_type, query=None):
    print(__name__, 'Fetching doc ids for', server, src_index, src_type)
    if query is None:
        query = {
            "match_all": {}
        }

    data_utils = DataUtils()
    ids = data_utils.batch_fetch_ids_for_query(base_url=server, index=src_index, type=src_type, query=query)

    documents_ids = dict.fromkeys(ids, None)
    print(__name__, 'Done, fetched', len(documents_ids), 'doc ids')

    return documents_ids
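A hypothetical call, assuming a local Elasticsearch endpoint; the URL, index and type names below are placeholders:

doc_ids = export_doc_ids('http://localhost:9200', 'my_index', 'my_type')
print(len(doc_ids), 'doc ids fetched')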
Example #11
    def __init__(self, reports_directory, src_server, src_index, src_type, dest_server, dest_index, dest_type):
        self.data_loader_utils_dest = DataLoaderUtils(dest_server, dest_index, dest_type)
        self.reports_directory = reports_directory

        self.src_server = src_server
        self.src_index = src_index
        self.src_type = src_type

        self.dest_server = dest_server
        self.dest_index = dest_index
        self.dest_type = dest_type

        self.copy_tags = True
        self.copy_annotations = True

        self.combine_tags = False # Combine not implemented, set to false  
        self.combine_annotations = False # Combine not implemented, set to false 

        self.data_utils = DataUtils()
Example #12
    def create_node(self, data_set, max_depth, feature_index_list, boost=False):
        """
        Recursive function that constructs the decision tree by DFS approach by setting respective parameters
        and branching based on information gain and entropy.
        :param data_set:
        :param max_depth:
        :param feature_index_list:
        :return:
        """

        # tree_node = None

        is_termination_condition, class_label = self.test_termination_condition(data_set,
                                                                                max_depth, feature_index_list)

        tree_node = TreeNode()
        eval_util = EvalUtil()
        data_util = DataUtils()

        feature_split_index = eval_util.get_split_attribute_index(data_set, feature_index_list, boost)

        # print feature_split_index
        tree_node.set_split_feature_index(feature_split_index)

        split_feature_values = data_util.get_feature_discrete_values(data_set, feature_split_index)

        # tree_node.set_pos_neg(positive, negative)

        tree_node.set_class_label(class_label)

        revised_index_list = [x for x in feature_index_list if x != feature_split_index]

        if not is_termination_condition:
            for value in split_feature_values:
                data_subset = data_util.get_data_subset(data_set, feature_split_index, value)
                if len(data_subset) > 0:
                    child_node = self.create_node(data_subset, max_depth - 1, revised_index_list)
                    tree_node.append_child(value, child_node)
                else:
                    tree_node.append_child(value, None)

        return tree_node
Example #13
 def load_data(self):
     self.du = DataUtils(self.config)
     self.embedding_size = self.du.embedding_size
     self.sentence_size = self.du.sentence_size
     self.memory_size = self.du.memory_size
     self.vocab_size = self.du.vocab_size
     self.vocab = self.du.vocab
     self.trainS = self.du.trainS
     self.trainQ = self.du.trainQ
     self.trainA = self.du.trainA
     self.valS = self.du.valS
     self.valQ = self.du.valQ
     self.valA = self.du.valA
     self.testS = self.du.testS
     self.testQ = self.du.testQ
     self.testA = self.du.testA
     self.train_labels = self.du.train_labels
     self.val_labels = self.du.val_labels
     self.test_labels = self.du.test_labels
     self.data_length = len(self.du.data)
Example #14
 def setup_and_verify(self):
     parser = argparse.ArgumentParser(
         description=
         'trec 2019 deep learning track (document re-ranking task)')
     torch.set_printoptions(threshold=500)
     self.data_utils = DataUtils(self.printer)
     self.model_utils = NDRMUtils(self.printer)
     self.learner_utils = LearnerUtils(self.printer)
     self.sub_utils = [
         self.data_utils, self.model_utils, self.learner_utils
     ]
     for sub_utils in self.sub_utils:
         sub_utils.parent = self
         sub_utils.parser_add_args(parser)
     self.args = parser.parse_args()
     for sub_utils in self.sub_utils:
         sub_utils.parser_validate_args(self.args)
     self.__print_versions()
     for sub_utils in self.sub_utils:
         sub_utils.setup_and_verify()
     self.__print_args()
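The loop above relies on each utils object exposing the same small protocol: parser_add_args, parser_validate_args, setup_and_verify, plus a parent back-reference. A minimal hypothetical sub-utils class that would plug into it; the --example-flag option is invented for illustration.

class MinimalSubUtils:
    def __init__(self, printer=print):
        self.printer = printer
        self.parent = None  # set by the owner before parsing, as in the snippet above
        self.args = None

    def parser_add_args(self, parser):
        parser.add_argument('--example-flag', type=int, default=1)

    def parser_validate_args(self, args):
        assert args.example_flag > 0, '--example-flag must be positive'
        self.args = args

    def setup_and_verify(self):
        self.printer('MinimalSubUtils ready (example_flag=%d)' % self.args.example_flag)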
Example #15
    def test_termination_condition(self, data_set, max_depth, feature_index_list):
        """
        Function that tests termination condition for a tree to expand.
        The termination is reached if max depth supplied is reached, or feature index list is exhausted,
        or the class is pure class.
        :param data_set:
        :param max_depth:
        :param feature_index_list:
        :return:
        """

        termination = True

        d_util = DataUtils()

        class_label, is_pure_class = d_util.get_class_label(data_set)

        if max_depth > 0 and feature_index_list is not None and len(feature_index_list) > 0 and not is_pure_class:
            termination = False

        return termination, class_label
Example #16
# -*- coding: utf-8 -*-
import os
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23

from data_utils import DataUtils

cur_dir = os.path.dirname(__file__)
d_utils = DataUtils(filepath=os.path.join(cur_dir, "data/pKaInWater.csv"))
X_data, y_data_acidic, y_data_basic = d_utils.get_classification_data(
    feature_type="morgan+macc")
y_data = np.array([y_data_acidic, y_data_basic]).T

# train test split
seed = 7
X_train, X_test, y_train, y_test = train_test_split(X_data,
                                                    y_data,
                                                    test_size=0.2,
                                                    random_state=seed)
print("\n ========================= \n")
print("X_train.shape:", X_train.shape, "X_test.shape", X_test.shape)
print("\n ========================= \n")


def model_evaluation(model, x_input, y_input):
    y_pred = model.predict(x_input)
    print(classification_report(y_true=y_input, y_pred=y_pred))
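Continuing the snippet above, one plausible way to drive model_evaluation is to wrap xgb.XGBClassifier in OneVsRestClassifier so the two-column (acidic/basic) label matrix is treated as a multilabel problem; the hyperparameters are placeholders, not values from the original script.

clf = OneVsRestClassifier(xgb.XGBClassifier(n_estimators=100, max_depth=4))
clf.fit(X_train, y_train)              # y_train has one column per pKa class
model_evaluation(clf, X_test, y_test)  # prints a per-label classification report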
Example #17
    def train():
        alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
        print('======== experiment settings =========')
        print('alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, ws : %d, ns : %d, maxT : % d, minT : %d, max_iter : %d, d : %d' % (alpha, beta, gamma, lam, args.p, args.ws, args.ns,args.maxT,args.minT,args.max_iter, args.d))
        print('========== processing data ===========')
        dul = DataUtils(model_path)
        if args.rec:
            test_user, test_item, test_rate = dul.read_data(args.test_data)
        print("constructing graph....")
        gul = GraphUtils(model_path)
        gul.construct_training_graph(args.train_data)
        edge_dict_u = gul.edge_dict_u
        edge_list = gul.edge_list
        walk_generator(gul,args)
        print("getting context and negative samples....")
        context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = get_context_and_negative_samples(gul, args)
        node_list_u, node_list_v = {}, {}
        init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)
        last_loss, count, epsilon = 0, 0, 1e-3

        print("============== training ==============")
        for iter in range(0, args.max_iter):
            s1 = "\r[%s%s]%0.2f%%"%("*"* iter," "*(args.max_iter-iter),iter*100.0/(args.max_iter-1))
            loss = 0
            visited_u = dict(zip(node_list_u.keys(), [0] * len(node_list_u.keys())))
            visited_v = dict(zip(node_list_v.keys(), [0] * len(node_list_v.keys())))
            random.shuffle(edge_list)
            for i in range(len(edge_list)):
                u, v, w = edge_list[i]

                length = len(context_dict_u[u])
                random.shuffle(context_dict_u[u])
                if visited_u.get(u) < length:
                    # print(u)
                    index_list = list(range(visited_u.get(u),min(visited_u.get(u)+1,length)))
                    for index in index_list:
                        context_u = context_dict_u[u][index]
                        neg_u = neg_dict_u[u][index]
                        # center,context,neg,node_list,eta
                        for z in context_u:
                            tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u, lam, alpha)
                            node_list_u[z]['embedding_vectors'] += tmp_z
                            loss += tmp_loss
                    visited_u[u] = index_list[-1]+3

                length = len(context_dict_v[v])
                random.shuffle(context_dict_v[v])
                if visited_v.get(v) < length:
                    # print(v)
                    index_list = list(range(visited_v.get(v),min(visited_v.get(v)+1,length)))
                    for index in index_list:
                        context_v = context_dict_v[v][index]
                        neg_v = neg_dict_v[v][index]
                        # center,context,neg,node_list,eta
                        for z in context_v:
                            tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v, lam, beta)
                            node_list_v[z]['embedding_vectors'] += tmp_z
                            loss += tmp_loss
                    visited_v[v] = index_list[-1]+3

                update_u, update_v, tmp_loss = KL_divergence(edge_dict_u, u, v, node_list_u, node_list_v, lam, gamma)
                loss += tmp_loss
                node_list_u[u]['embedding_vectors'] += update_u
                node_list_v[v]['embedding_vectors'] += update_v

            delta_loss = abs(loss - last_loss)
            if last_loss > loss:
                lam *= 1.05
            else:
                lam *= 0.95
            last_loss = loss
            if delta_loss < epsilon:
                break
            sys.stdout.write(s1)
            sys.stdout.flush()
Example #18
def train_by_sampling(args):
    model_path = os.path.join('../', args.model_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print(
        'alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, ws : %d, ns : %d, maxT : % d, minT : %d, max_iter : %d, d : %d'
        % (alpha, beta, gamma, lam, args.p, args.ws, args.ns, args.maxT,
           args.minT, args.max_iter, args.d))
    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)
    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(
        args.train_data)  # train_data='../data/wiki/rating_train.dat'
    edge_dict_u = gul.edge_dict_u  # node-edge relations as a dict
    edge_list = gul.edge_list  # node-edge relations as a list
    walk_generator(gul, args)  # generate the random walks
    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = get_context_and_negative_samples(
        gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v,
                           args)  # initialise the node embeddings
    last_loss, count, epsilon = 0, 0, 1e-3

    print("============== training ==============")
    for iter in range(0, args.max_iter):
        s1 = "\r[%s%s]%0.2f%%" % ("*" * iter, " " *
                                  (args.max_iter - iter), iter * 100.0 /
                                  (args.max_iter - 1))
        loss = 0
        visited_u = dict(zip(node_list_u.keys(),
                             [0] * len(node_list_u.keys())))  # u side starts at 0
        visited_v = dict(zip(node_list_v.keys(),
                             [0] * len(node_list_v.keys())))  # v side starts at 0
        random.shuffle(edge_list)  # edge_list: node-edge information
        for i in range(len(edge_list)):
            u, v, w = edge_list[i]

            length = len(context_dict_u[u])  # number of surrounding neighbours
            random.shuffle(context_dict_u[u])
            if visited_u.get(u) < length:
                # print(u)
                index_list = list(
                    range(visited_u.get(u), min(visited_u.get(u) + 1, length)))
                for index in index_list:
                    context_u = context_dict_u[u][index]  # pick one neighbour of the node
                    # negative samples for the node; sampling is itself random, so shuffling the
                    # context is enough, and the negatives also differ across training epochs
                    neg_u = neg_dict_u[u][index]
                    # center,context,neg,node_list,eta
                    for z in context_u:  # skip-gram update of the embedding for every neighbour node
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u,
                                                    lam, alpha)
                        node_list_u[u]['embedding_vectors'] += tmp_z  # update the node embedding
                        loss += tmp_loss
                visited_u[u] = index_list[-1] + 3

            length = len(context_dict_v[v])
            random.shuffle(context_dict_v[v])
            if visited_v.get(v) < length:
                # print(v)
                index_list = list(
                    range(visited_v.get(v), min(visited_v.get(v) + 1, length)))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    # center,context,neg,node_list,eta
                    for z in context_v:
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v,
                                                    lam, beta)
                        node_list_v[v]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = index_list[-1] + 3
            # edge_dict_u: edge-connection information
            update_u, update_v, tmp_loss = KL_divergence(
                edge_dict_u, u, v, node_list_u, node_list_v, lam,
                gamma)  # compute the KL divergence term
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v
        # this is gradient ascent, so a larger loss is better
        delta_loss = abs(loss - last_loss)
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write(s1)
        sys.stdout.flush()
    save_to_file(node_list_u, node_list_v, model_path, args)
    print("")
    if args.rec:
        print("============== testing ===============")
        f1, map, mrr, mndcg = top_N(test_user, test_item, test_rate,
                                    node_list_u, node_list_v, args.top_n)
        print(
            'recommendation metrics: F1 : %0.4f, MAP : %0.4f, MRR : %0.4f, NDCG : %0.4f'
            % (round(f1, 4), round(map, 4), round(mrr, 4), round(mndcg, 4)))
    if args.lip:
        print("============== testing ===============")
        auc_roc, auc_pr = link_prediction(args)
        print('link prediction metrics: AUC_ROC : %0.4f, AUC_PR : %0.4f' %
              (round(auc_roc, 4), round(auc_pr, 4)))
Example #19
        #    if curLabel[index] in self.iterativelyTest(data, self.tree):
        #        counter += 1

        return counter / len(curLabel)


class Node(object):
    def __init__(self, isLeaf, attribute, threshold):
        self.isLeaf = isLeaf
        self.attribute = attribute
        self.threshold = threshold
        self.children = []


if __name__ == '__main__':
    from data_utils import DataUtils

    trainfile_X = 'train-images-idx3-ubyte'
    trainfile_y = 'train-labels-idx1-ubyte'
    testfile_X = 't10k-images-idx3-ubyte'
    testfile_y = 't10k-labels-idx1-ubyte'
    train_X = DataUtils(filename=trainfile_X).getImage()
    train_y = DataUtils(filename=trainfile_y).getLabel()
    test_X = DataUtils(testfile_X).getImage()
    test_y = DataUtils(testfile_y).getLabel()

    a = C45()
    a.fit(train_X, train_y)
    #a.visualize2(a.tree)
    print(a.test(test_X, test_y))
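The getImage() and getLabel() methods of this DataUtils are not shown. A self-contained sketch of an equivalent reader for the standard MNIST IDX files, assuming the usual big-endian layout; the function names below are not the project's.

import struct
import numpy as np

def read_idx_images(path):
    # Header: magic 2051, image count, rows, cols (big-endian uint32), then raw pixels.
    with open(path, 'rb') as f:
        _magic, num, rows, cols = struct.unpack('>IIII', f.read(16))
        pixels = np.frombuffer(f.read(), dtype=np.uint8)
    return pixels.reshape(num, rows * cols)

def read_idx_labels(path):
    # Header: magic 2049, label count (big-endian uint32), then one byte per label.
    with open(path, 'rb') as f:
        _magic, num = struct.unpack('>II', f.read(8))
        return np.frombuffer(f.read(), dtype=np.uint8)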
Example #20
def step_experiment(args, max_n=5, test_size=0.3):
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(
        pos, test_size=test_size, random_state=seed)
    neg_train, neg_test = train_test_split(
        neg, test_size=test_size, random_state=seed)

    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)
    if args.name in ['member']:
        N_max_list = [3, 6, 9, 12]
    else:
        N_max_list = [10, 15, 20, 25, 30, 35, 40]

    if args.name in ['subtree']:
        N_beam = 15
        T_beam = 3
    else:
        N_beam = 10
        T_beam = 7

    AUCs = []
    AUC_stds = []
    MSEs = []
    MSE_stds = []
    N = 5  # how many times to perform weight learn

    naive_AUCs = []
    naive_AUC_stds = []
    naive_MSEs = []
    naive_MSE_stds = []

    for N_max in N_max_list:
        CG = ClauseGenerator(ilp_train, infer_step=args.T,
                             max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                           m=args.m, infer_step=args.T)
        #solver = ILPSolver(ilp_train, C_0=clauses, m=args.m, infer_step=args.T)
        clauses_, Ws_list, loss_list_list = solver.train_N(
            N=N, gen_mode='beam', N_max=N_max, T_beam=T_beam, N_beam=N_beam, epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)

        auc_list = np.array(
            [compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        auc_mean = np.mean(auc_list)
        auc_std = np.std(auc_list)
        AUCs.append(auc_mean)
        AUC_stds.append(auc_std)

        mse_list = np.array(
            [compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        mse_mean = np.mean(mse_list)
        mse_std = np.std(mse_list)
        MSEs.append(mse_mean)
        MSE_stds.append(mse_std)

        # NAIVE
        CG = ClauseGenerator(ilp_train, infer_step=args.T,
                             max_depth=1, max_body_len=1)
        solver = ILPSolver(ilp_train, C_0=clauses, CG=CG,
                           m=args.m, infer_step=args.T)
        clauses_, Ws_list, naive_loss_list_list = solver.train_N(
            N=N, gen_mode='naive', N_max=N_max, T_beam=T_beam, N_beam=N_beam, epoch=args.epoch, lr=args.lr, wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)

        auc_list = np.array(
            [compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        auc_mean = np.mean(auc_list)
        auc_std = np.std(auc_list)
        naive_AUCs.append(auc_mean)
        naive_AUC_stds.append(auc_std)

        mse_list = np.array(
            [compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        mse_mean = np.mean(mse_list)
        mse_std = np.std(mse_list)
        naive_MSEs.append(mse_mean)
        naive_MSE_stds.append(mse_std)

        for j in range(N):
            loss_path = 'imgs/step/loss/' + args.name + \
                '[N_max:' + str(N_max) + ']-' + str(j) + '.pdf'
            ys_list = [loss_list_list[j], naive_loss_list_list[j]]
            plot_loss_compare(loss_path, ys_list, args.name +
                              ':[N_max:' + str(N_max) + ']-' + str(j))

    path_auc = 'imgs/step/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/step/' + args.name + '_MSE.pdf'
    print(AUC_stds)
    print(MSE_stds)
    labels = ['proposed', 'naive']

    plot_line_graph_compare_err(path=path_auc, xs=N_max_list, ys_list=[AUCs, naive_AUCs], err_list=[AUC_stds, naive_AUC_stds],
                                xlabel='Number of clauses', ylabel='AUC', title=args.name, labels=labels)
    plot_line_graph_compare_err(path=path_mse, xs=N_max_list, ys_list=[MSEs, naive_MSEs], err_list=[MSE_stds, naive_MSE_stds],
                                xlabel='Number of clauses', ylabel='Mean-squared test error', title=args.name, labels=labels)
Example #21
def noise_experiment(args, test_size=0.3):
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos,
                                           test_size=test_size,
                                           random_state=seed)
    neg_train, neg_test = train_test_split(neg,
                                           test_size=test_size,
                                           random_state=seed)

    noise_rates = [
        0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.30, 0.35, 0.40, 0.45, 0.50
    ]
    baseline_auc = [1.0, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5]

    datasets = get_datasets_with_noise(pos_train, neg_train, noise_rates)
    AUCs = []
    AUC_stds = []
    MSEs = []
    MSE_stds = []
    N = 5  # how many times to perform weight learn

    if args.name == 'member':
        T_beam = 3
        N_beam = 3

    elif args.name == 'subtree':
        T_beam = 3
        N_beam = 15
    else:
        T_beam = 5
        N_beam = 10

    N_max = 50

    for i, (pos_train, neg_train) in enumerate(datasets):
        ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)
        print('NOISE RATE: ', noise_rates[i])
        ilp_train.print()
        CG = ClauseGenerator(ilp_train,
                             infer_step=args.T,
                             max_depth=1,
                             max_body_len=1)
        solver = ILPSolver(ilp_train,
                           C_0=clauses,
                           CG=CG,
                           m=args.m,
                           infer_step=args.T)
        clauses_, Ws_list, loss_list_list = solver.train_N(N=N,
                                                           gen_mode='beam',
                                                           N_max=N_max,
                                                           T_beam=T_beam,
                                                           N_beam=N_beam,
                                                           epoch=args.epoch,
                                                           lr=args.lr,
                                                           wd=0.0)
        v_list, facts = solver.predict_N(pos_test, neg_test, clauses_, Ws_list)

        auc_list = np.array(
            [compute_auc(pos_test, neg_test, v_, facts) for v_ in v_list])
        auc_mean = np.mean(auc_list)
        auc_std = np.std(auc_list)
        AUCs.append(auc_mean)
        AUC_stds.append(auc_std)

        mse_list = np.array(
            [compute_mse(pos_test, neg_test, v_, facts) for v_ in v_list])
        mse_mean = np.mean(mse_list)
        mse_std = np.std(mse_list)
        MSEs.append(mse_mean)
        MSE_stds.append(mse_std)
        for j in range(N):
            loss_path = 'imgs/noise/loss/' + args.name + \
                '[noise:' + str(noise_rates[i]) + ']-' + str(j) + '.pdf'
            plot_loss(
                loss_path, loss_list_list[j],
                args.name + ':[noise:' + str(noise_rates[i]) + ']-' + str(j))

    # plot AUC with baseline
    path_auc = 'imgs/noise/' + args.name + '_AUC.pdf'
    path_mse = 'imgs/noise/' + args.name + '_MSE.pdf'

    print(AUC_stds)
    print(MSE_stds)

    plot_line_graph_baseline_err(
        path=path_auc,
        xs=noise_rates,
        ys=AUCs,
        err=AUC_stds,
        xlabel='Proportion of mislabeled training data',
        ylabel='AUC',
        title=args.name,
        baseline=baseline_auc)
    # plot MSR with std
    plot_line_graph_err(path=path_mse,
                        xs=noise_rates,
                        ys=MSEs,
                        err=MSE_stds,
                        xlabel='Proportion of mislabeled training data',
                        ylabel='Mean-squared test error',
                        title=args.name)
Example #22
def train(args):
    model_path = os.path.join('../', args.model_name)
    if os.path.exists(model_path) is False:
        os.makedirs(model_path)
    alpha, beta, gamma, lam = args.alpha, args.beta, args.gamma, args.lam
    print('======== experiment settings =========')
    print(
        'alpha : %0.4f, beta : %0.4f, gamma : %0.4f, lam : %0.4f, p : %0.4f, ws : %d, ns : %d, maxT : % d, minT : %d, max_iter : %d, d : %d'
        % (alpha, beta, gamma, lam, args.p, args.ws, args.ns, args.maxT,
           args.minT, args.max_iter, args.d))
    print('========== processing data ===========')
    dul = DataUtils(model_path)
    if args.rec:
        test_user, test_item, test_rate = dul.read_data(args.test_data)
    print("constructing graph....")
    gul = GraphUtils(model_path)
    gul.construct_training_graph(args.train_data)
    edge_dict_u = gul.edge_dict_u
    edge_list = gul.edge_list
    walk_generator(gul, args)

    print("getting context and negative samples....")
    context_dict_u, neg_dict_u, context_dict_v, neg_dict_v, node_u, node_v = get_context_and_negative_samples(
        gul, args)
    node_list_u, node_list_v = {}, {}
    init_embedding_vectors(node_u, node_v, node_list_u, node_list_v, args)

    last_loss, count, epsilon = 0, 0, 1e-3
    print("============== training ==============")
    for iter in range(0, args.max_iter):
        s1 = "\r[%s%s]%0.2f%%" % ("*" * iter, " " *
                                  (args.max_iter - iter), iter * 100.0 /
                                  (args.max_iter - 1))
        loss = 0
        num = 0
        visited_u = dict(zip(node_list_u.keys(),
                             [0] * len(node_list_u.keys())))
        visited_v = dict(zip(node_list_v.keys(),
                             [0] * len(node_list_v.keys())))

        random.shuffle(edge_list)
        for (u, v, w) in edge_list:
            if visited_u.get(u) == 0 or random.random() > 0.95:
                # print(u)
                length = len(context_dict_u[u])
                index_list = random.sample(list(range(length)), min(length, 1))
                for index in index_list:
                    context_u = context_dict_u[u][index]
                    neg_u = neg_dict_u[u][index]
                    # center,context,neg,node_list,eta
                    for k, z in enumerate(context_u):
                        tmp_z, tmp_loss = skip_gram(u, z, neg_u, node_list_u,
                                                    lam, alpha)
                        node_list_u[z]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_u[u] = 1
            if visited_v.get(v) == 0 or random.random() > 0.95:
                # print(v)
                length = len(context_dict_v[v])
                index_list = random.sample(list(range(length)), min(length, 1))
                for index in index_list:
                    context_v = context_dict_v[v][index]
                    neg_v = neg_dict_v[v][index]
                    # center,context,neg,node_list,eta
                    for k, z in enumerate(context_v):
                        tmp_z, tmp_loss = skip_gram(v, z, neg_v, node_list_v,
                                                    lam, beta)
                        node_list_v[z]['embedding_vectors'] += tmp_z
                        loss += tmp_loss
                visited_v[v] = 1
            # print(len(edge_dict_u))
            update_u, update_v, tmp_loss = KL_divergence(
                edge_dict_u, u, v, node_list_u, node_list_v, lam, gamma)
            loss += tmp_loss
            node_list_u[u]['embedding_vectors'] += update_u
            node_list_v[v]['embedding_vectors'] += update_v
            count = iter
            num += 1
        delta_loss = abs(loss - last_loss)
        if last_loss > loss:
            lam *= 1.05
        else:
            lam *= 0.95
        last_loss = loss
        if delta_loss < epsilon:
            break
        sys.stdout.write(s1)
        sys.stdout.flush()
    save_to_file(node_list_u, node_list_v, model_path, args)
    print("")
    if args.rec:
        print("============== testing ===============")
        f1, map, mrr, mndcg = top_N(test_user, test_item, test_rate,
                                    node_list_u, node_list_v, args.top_n)
        print(
            'recommendation metrics: F1 : %0.4f, MAP : %0.4f, MRR : %0.4f, NDCG : %0.4f'
            % (round(f1, 4), round(map, 4), round(mrr, 4), round(mndcg, 4)))
    if args.lip:
        print("============== testing ===============")
        auc_roc, auc_pr = link_prediction(args)
        print('link prediction metrics: AUC_ROC : %0.4f, AUC_PR : %0.4f' %
              (round(auc_roc, 4), round(auc_pr, 4)))
Example #23
program_start = time()
root_path = r"D:\FF120\workspace\Python\data\MINIST"  # MNIST data directory
save_to_disk = False  # whether to write the generated images out to disk

if os.path.exists(root_path + "\\train_X.npy"):
    train_X = np.load(root_path + '\\train_X.npy')
    train_y = np.load(root_path + '\\train_y.npy')
    test_X = np.load(root_path + '\\test_X.npy')
    test_y = np.load(root_path + '\\test_y.npy')
else:
    trainfile_X = root_path + '\\train-images.idx3-ubyte'
    trainfile_y = root_path + '\\train-labels.idx1-ubyte'
    testfile_X = root_path + '\\t10k-images.idx3-ubyte'
    testfile_y = root_path + '\\t10k-labels.idx1-ubyte'

    train_X = DataUtils(filename=trainfile_X).getImage()
    train_y = DataUtils(filename=trainfile_y).getLabel()
    test_X = DataUtils(testfile_X).getImage()
    test_y = DataUtils(testfile_y).getLabel()

    np.save(root_path + "\\train_X.npy", train_X)
    np.save(root_path + "\\train_y.npy", train_y)
    np.save(root_path + "\\test_X.npy", test_X)
    np.save(root_path + "\\test_y.npy", test_y)

# the block below saves the images to local files
if save_to_disk:
    t0 = time()  # consistent with the bare time() call used for program_start above
    path_trainset = root_path + "\\imgs_train"
    path_testset = root_path + "\\imgs_test"
    if not os.path.exists(path_trainset):
Example #24
def softor_experiment(args, max_n=5, test_size=0.3):
    du = DataUtils(args.name)
    pos, neg, bk, clauses, lang = du.load_data()
    pos_train, pos_test = train_test_split(pos,
                                           test_size=test_size,
                                           random_state=7014)
    neg_train, neg_test = train_test_split(neg,
                                           test_size=test_size,
                                           random_state=7014)

    ilp_train = ILPProblem(pos_train, neg_train, bk, lang, name=args.name)

    N_max = 50

    if args.name in ['member']:
        T_beam = 3
        N_beam = 3
        m = 3

    elif args.name in ['subtree']:
        T_beam = 3
        N_beam = 15
        m = 4

    else:
        T_beam = 5
        N_beam = 10
        m = 3

    CG = ClauseGenerator(ilp_train,
                         infer_step=args.T,
                         max_depth=1,
                         max_body_len=1)
    solver = ILPSolver(ilp_train,
                       C_0=clauses,
                       CG=CG,
                       m=args.m,
                       infer_step=args.T)
    #solver = ILPSolver(ilp_train, C_0=clauses, m=args.m, infer_step=args.T, im_mode='softmax')
    clauses_, Ws_, loss_list, times = solver.train_time(gen_mode='beam',
                                                        N_max=N_max,
                                                        T_beam=T_beam,
                                                        N_beam=N_beam,
                                                        epoch=args.epoch,
                                                        lr=args.lr,
                                                        wd=0.0)
    print('Ws: ')
    for W in Ws_:
        print(F.softmax(W, dim=0))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)
    ent = compute_ent(Ws_, gen_mode='softmax')
    print('ENT:', ent)

    print('AUC:', auc)

    df = {}
    df['AUC'] = auc
    df['N_params'] = solver.count_params()
    df['time'] = mean(times)
    df['std'] = stdev(times)
    df['MSE'] = mse
    df['ENT'] = compute_ent(Ws_, gen_mode='softmax')

    path = 'results/' + args.name + '_softor' + '.txt'
    save(path, df)

    # PAIR
    CG = ClauseGenerator(ilp_train,
                         infer_step=args.T,
                         max_depth=1,
                         max_body_len=1)
    solver = ILPSolver(ilp_train,
                       C_0=clauses,
                       CG=CG,
                       m=args.m,
                       infer_step=args.T,
                       im_mode='pair')
    #solver = ILPSolver(ilp_train, C_0=clauses, m=2, infer_step=args.T, im_mode='pair')
    clauses_, Ws_, pair_loss_list, times = solver.train_time(gen_mode='beam',
                                                             N_max=N_max,
                                                             T_beam=T_beam,
                                                             N_beam=N_beam,
                                                             epoch=args.epoch,
                                                             lr=args.lr,
                                                             wd=0.0)
    print('Ws: ')
    print(softmax2d(Ws_[0]))
    v_, facts = solver.predict(pos_test, neg_test, clauses_, Ws_)
    auc = compute_auc(pos_test, neg_test, v_, facts)
    mse = compute_mse(pos_test, neg_test, v_, facts)

    df = {}
    df['AUC_pair'] = auc
    df['N_params_pair'] = solver.count_params()
    df['time_pair'] = mean(times)
    df['std_pair'] = stdev(times)
    df['MSE_pair'] = mse
    df['ENT_pair'] = compute_ent(Ws_, gen_mode='pair')

    path = 'results/' + args.name + '_pair' + '.txt'
    save(path, df)
    print(df)

    loss_path = 'imgs/softor/loss/' + args.name + '.pdf'
    ys_list = [loss_list, pair_loss_list]
    plot_loss_compare(loss_path, ys_list, args.name)
Example #25
 def __init__(self):
     self.data_utils = DataUtils()
     self.transfer_learner = TransferLearner()
     self.analyser = Analyser()
Example #26
def run(epoches, batch_size, order, learnrate, learnrate2, decay, decay2):

    trainfile_X = 'train-images-idx3-ubyte'
    trainfile_y = 'train-labels-idx1-ubyte'
    testfile_X = 't10k-images-idx3-ubyte'
    testfile_y = 't10k-labels-idx1-ubyte'

    train_X = DataUtils(filename=trainfile_X).getImage()
    train_y = DataUtils(filename=trainfile_y).getLabel()
    test_X = DataUtils(testfile_X).getImage()
    test_y = DataUtils(testfile_y).getLabel()

    train_y = normalize(train_y)
    test_y = normalize(test_y)

    train_X = np.expand_dims(train_X, axis=-1)
    test_X = np.expand_dims(test_X, axis=-1)
    train_y = np.expand_dims(train_y, axis=-1)
    test_y = np.expand_dims(test_y, axis=-1)

    epoches = epoches
    batch_size = batch_size
    batches = int(len(train_X) / batch_size)
    order = order

    mu = 0.0
    sigma = 0.1
    image_d = 784
    neuron_d_1 = 256
    neuron_d_2 = 128
    neuron_d_output = 10

    dimension_1 = [neuron_d_1, image_d]
    weights_1 = create_weights(mu, sigma, dimension_1)
    bias_1 = np.zeros((neuron_d_1, 1))

    dimension_2 = [neuron_d_2, neuron_d_1]
    weights_2 = create_weights(mu, sigma, dimension_2)
    bias_2 = np.zeros((neuron_d_2, 1))

    dimension_3 = [neuron_d_output, neuron_d_2]
    weights_3 = create_weights(mu, sigma, dimension_3)
    bias_3 = np.zeros((neuron_d_output, 1))

    weights = [weights_1, weights_2, weights_3]
    bias = [bias_1, bias_2, bias_3]

    for epoch in range(epoches):
        learnrate -= learnrate * decay
        learnrate2 -= learnrate2 * decay2

        #train_X, train_y = shuffle(train_X,train_y,random_state=0)

        for i in range(batches):
            loss = 0
            accuracy = 0
            start = i * batch_size
            end = (i + 1) * batch_size
            x = np.concatenate(train_X[start:end], axis=-1)
            y = np.concatenate(train_y[start:end], axis=-1)

            layer_i, layer_o = forward(x, weights, bias)
            layer_b_w, layer_b = backward(x, y, layer_i, layer_o, order,
                                          weights, bias)

            if order == 1:
                weights_3 = np.add(weights_3, learnrate * layer_b_w[0])
                weights_2 = np.add(weights_2, learnrate * layer_b_w[1])
                weights_1 = np.add(weights_1, learnrate * layer_b_w[2])
                bias_3 = np.mean(np.add(bias_3, learnrate * layer_b[0]),
                                 axis=-1,
                                 keepdims=True)
                bias_2 = np.mean(np.add(bias_2, learnrate * layer_b[1]),
                                 axis=-1,
                                 keepdims=True)
                bias_1 = np.mean(np.add(bias_1, learnrate * layer_b[2]),
                                 axis=-1,
                                 keepdims=True)

            elif order == 2:
                weights_3 = np.add(
                    np.add(weights_3, learnrate * layer_b_w[0][0]),
                    -learnrate2 * layer_b_w[0][1])
                weights_2 = np.add(
                    np.add(weights_2, learnrate * layer_b_w[1][0]),
                    -learnrate2 * layer_b_w[1][1])
                weights_1 = np.add(
                    np.add(weights_1, learnrate * layer_b_w[2][0]),
                    -learnrate2 * layer_b_w[2][1])
                bias_3 = np.mean(np.add(
                    np.add(bias_3, learnrate * layer_b[0][0]),
                    -learnrate2 * layer_b[0][1]),
                                 axis=-1,
                                 keepdims=True)
                bias_2 = np.mean(np.add(
                    np.add(bias_2, learnrate * layer_b[1][0]),
                    -learnrate2 * layer_b[1][1]),
                                 axis=-1,
                                 keepdims=True)
                bias_1 = np.mean(np.add(
                    np.add(bias_1, learnrate * layer_b[2][0]),
                    -learnrate2 * layer_b[2][1]),
                                 axis=-1,
                                 keepdims=True)

            weights = [weights_1, weights_2, weights_3]
            bias = [bias_1, bias_2, bias_3]

            loss = sum(sum(abs(np.subtract(y, layer_o[2]))))
            for col in range(batch_size):
                accuracy += int(layer_o[2][:, col].argmax() == y[:,
                                                                 col].argmax())
            accuracy = accuracy / batch_size
            print('epoch:{}, batch:{} , loss:{}, accuracy:{}'.format(
                epoch, i, loss, accuracy))
            accuracy = 0

    accuracy = 0
    X = np.concatenate(test_X, axis=-1)
    Y = np.concatenate(test_y, axis=-1)
    _, output = forward(X, weights, bias)
    for i in range(len(Y[0])):
        accuracy += int(output[2][:, i].argmax() == Y[:, i].argmax())
    print('test accuracy:{}'.format(float(accuracy / len(Y[0]))))

    del weights
    del bias

    return accuracy
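create_weights and normalize are assumed helpers in the snippet above. Given how they are consumed (Gaussian-initialised weight matrices and one-hot targets of shape (N, 10)), plausible minimal versions could look like this; both bodies are assumptions, not the original code.

import numpy as np

def create_weights(mu, sigma, dimension):
    # Gaussian-initialised weight matrix with the requested shape.
    return np.random.normal(mu, sigma, size=dimension)

def normalize(labels, num_classes=10):
    # One-hot encode integer class labels into shape (len(labels), num_classes).
    one_hot = np.zeros((len(labels), num_classes))
    one_hot[np.arange(len(labels)), np.asarray(labels, dtype=int)] = 1.0
    return one_hot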
Example #27
from data_utils import DataUtils

dataUtils = DataUtils()

if __name__ == '__main__':

    dataUtils.pre_process_aws(0.01)
    dataUtils.pre_process_aws(0.1)
    dataUtils.pre_process_aws(1)

    dataUtils.pre_process_col_tweets()
Example #28
# coding=utf-8
import torch
import time

from cvae import ContinuousAgent, ContinuousVAE
from data_utils import DataUtils
from config import RunConfig as Config

if __name__ == "__main__":
    config = Config()
    api = DataUtils(config.ctx_encode_method)
    api.load_vocab()
    api.load_candidates()
    api.load_dialog(config.coming_task, config.system_mode)
    api.build_pad_config(config.memory_size)

    model = ContinuousVAE(config, api)

    if config.trained_model is not None:
        print("Using trained model in {}".format(config.trained_model))
        model.load_state_dict(torch.load(config.trained_model))

    agent = ContinuousAgent(config, model, api)

    t1 = time.time()
    agent.main()
    t2 = time.time()
    print("cost time: {} seconds.".format(t2 - t1))