Example #1
def human_test(filename, sample_size, random_seed=None, filter_fn=None):
    """
    Presents a textual interface that allows human accuracy to be evaluated on the given data. A human will be asked
    to classify a given comment into a specific subreddit.
    :param filename: Str, a filename as a path.
    :param sample_size: Number of Comment instances to sample from the file.
    :param random_seed: An integer value that can be used to ensure stable random generation of the data set.
    :param filter_fn: A function that can be used to filter which Comments are included in the data set on a
    per-comment basis.
    :return: Dict with the key "human_average" mapping to the fraction of correctly classified comment => subreddit
    labels.
    """

    data = DataSet(
        [comment for comment in load_comments(filename, sample_size)],
        random_seed, filter_fn).generate_human(filter_fn)
    correct = 0
    potential_labels = list(set(data[1]))

    for i in range(len(data[0])):
        print(data[0][i])

        for j in range(len(potential_labels)):
            print("[{}]: {}".format(j, potential_labels[j]))
        choice = int(input("Enter the # corresponding to the correct subreddit: "))
        if potential_labels[choice] == data[1][i]:
            correct += 1

    return {"human_average": correct / float(len(data[0]))}
Example #2
def evaluate(filename,
             sample_size,
             classifiers,
             n_cross_validation=5,
             random_seed=None,
             filter_fn=None,
             verbose=False):
    """
    Evaluates the given list of Classifier instances using n cross validation.
    :param filename: Str, a filename as a path
    :param sample_size: Number of Comment instances to use for each cross validation set.
    :param classifiers: List of Classifier instances which will each be fitted on the training data and scored on
    the testing data.
    :param n_cross_validation: Number of cross validation sets to generate during evaluation of each Classifier.
    :param random_seed: An integer value that can be used to ensure stable random generation of the data set.
    :param filter_fn: A function that can be used to filter which Comments are included in the training data on a
    per-comment basis.
    :param verbose: If enabled, progress output is printed while the evaluation runs.
    :return: Tuple (data set size, Dict{ classifier name: Dict{ n cross validation accuracies, average accuracy } })
    """

    sets = DataSet(
        [comment
         for comment in load_comments(filename, sample_size)], random_seed,
        filter_fn).generate_n_cross_validation_sets(n_cross_validation)

    if verbose:
        print("Finished generating cross validation sets")

    results = {}

    import numpy as np
    for classifier in classifiers:
        result = [
            classifier.fit(cv_set['training_sparse_matrix'],
                           cv_set['training_labels']).score(
                               cv_set['validation_sparse_matrix'],
                               cv_set['validation_labels']) for cv_set in sets
        ]
        results[type(classifier).__name__] = {
            "result": result,
            "average": np.mean(result)
        }
        if verbose:
            print("Finished testing {}".format(type(classifier).__name__))

    return (sets[0]['size'], results)
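
A sketch of how evaluate might be called, assuming scikit-learn estimators (they provide the fit/score interface the function expects and accept sparse matrices); the path and classifier choices are illustrative, not taken from the example:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Hypothetical driver: the file path is a placeholder.
size, results = evaluate("comments.json",
                         sample_size=2000,
                         classifiers=[MultinomialNB(), LogisticRegression(max_iter=1000)],
                         n_cross_validation=5,
                         random_seed=42,
                         verbose=True)
for name, res in results.items():
    print("{}: average accuracy {:.3f}".format(name, res["average"]))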
Example #3
    def load_data(self, filepath):
        # Load the train/test split and rescale the digitized track-word columns to a common fixed-point scale.
        self.DataSet = DataSet.fromTrainTest(filepath)
        mult = self.DataSet.trackword_config["TargetPrecision"]["full"] - \
                self.DataSet.trackword_config["TargetPrecision"]["int"]
        self.DataSet.X_train["b_trk_inv2R"] = self.DataSet.X_train["b_trk_inv2R"] * \
                (2**(mult-self.DataSet.trackword_config["InvR"]["nbits"]))
        self.DataSet.X_train["b_trk_cot"] = self.DataSet.X_train["b_trk_cot"] * \
                (2**(mult-self.DataSet.trackword_config["Cot"]["nbits"]))
        self.DataSet.X_train["b_trk_zT"] = self.DataSet.X_train["b_trk_zT"] * \
                (2**(mult-self.DataSet.trackword_config["ZT"]["nbits"]))
        self.DataSet.X_train["b_trk_phiT"] = self.DataSet.X_train["b_trk_phiT"] * \
                (2**(mult-self.DataSet.trackword_config["PhiT"]["nbits"]))
        # ==============================================================================
        self.DataSet.X_test["b_trk_inv2R"] = self.DataSet.X_test["b_trk_inv2R"] * \
                (2**(mult-self.DataSet.trackword_config["InvR"]["nbits"]))
        self.DataSet.X_test["b_trk_cot"] = self.DataSet.X_test["b_trk_cot"] * \
                (2**(mult-self.DataSet.trackword_config["Cot"]["nbits"]))
        self.DataSet.X_test["b_trk_zT"] = self.DataSet.X_test["b_trk_zT"] * \
                (2**(mult-self.DataSet.trackword_config["ZT"]["nbits"]))
        self.DataSet.X_test["b_trk_phiT"] = self.DataSet.X_test["b_trk_phiT"] * \
                (2**(mult-self.DataSet.trackword_config["PhiT"]["nbits"]))

        for k in range(4):
            self.DataSet.X_test["b_stub_r_" + str(k)] = self.DataSet.X_test["b_stub_r_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["r"]["nbits"]))
            self.DataSet.X_test["b_stub_phi_" + str(k)] = self.DataSet.X_test["b_stub_phi_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["phi"]["nbits"]))
            self.DataSet.X_test["b_stub_z_" + str(k)] = self.DataSet.X_test["b_stub_z_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["z"]["nbits"]))
            self.DataSet.X_test["b_stub_dPhi_" + str(k)] = self.DataSet.X_test["b_stub_dPhi_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["dPhi"]["nbits"]))
            self.DataSet.X_test["b_stub_dZ_" + str(k)] = self.DataSet.X_test["b_stub_dZ_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["dZ"]["nbits"]))
            # ==============================================================================
            self.DataSet.X_train["b_stub_r_" + str(k)] = self.DataSet.X_train["b_stub_r_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["r"]["nbits"]))
            self.DataSet.X_train["b_stub_phi_" + str(k)] = self.DataSet.X_train["b_stub_phi_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["phi"]["nbits"]))
            self.DataSet.X_train["b_stub_z_" + str(k)] = self.DataSet.X_train["b_stub_z_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["z"]["nbits"]))
            self.DataSet.X_train["b_stub_dPhi_" + str(k)] = self.DataSet.X_train["b_stub_dPhi_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["dPhi"]["nbits"]))
            self.DataSet.X_train["b_stub_dZ_" + str(k)] = self.DataSet.X_train["b_stub_dZ_" + str(k)] * \
                (2**(mult-self.DataSet.trackword_config["dZ"]["nbits"]))
Example #4
from Disperse_WHEEL import Disperse_Wheel
from OLH import OLH
# epsilon used by the wrapped helper functions
EPS = 1
M = 1  # amount of data per user
getm_usernum = 30000  # number of users used when estimating the 80th-percentile length m
D = 1  #domain

# Estimate the length that covers 80% of users
get_m = Get_M(EPS, getm_usernum)
get_m.load_data_random('kosarak.dat')
M = get_m.get_m()
#####################

# Load the full data set
Data1 = DataSet()
Data1.create_data_random('kosarak.dat', 0)
max_num = Data1.max_valued()
#D=max_num+1

# Index the items in the data set and store the numbering in a map
map_data = {}
Data1.transform_to_map(map_data)
map_length = len(map_data)
D = map_length + 1
#####################

# Pad records to a fixed length and update the map_data table
Data2 = DataSet()
Data2.create_data_certain('kosarak.dat', M, getm_usernum, D, map_data)
numbered_data = []  # the data converted to the numbered table; this can be used directly
Example #5
def train():
    LEARNING_RATE = 1e-3
    BATCH_SIZE = 64
    PRETRAIN_EPOCH = 1000
    PRETRAIN_EPOCH_d = 1100
    feature_nums = 15549
    dropout_value = 0.9
    dropout_sign = 1.0
    train_datapath = r"F:/project/simulation_data/drop60_p.train"
    EPOCH = 2500
    # outDir = r"F:/project/simulation_data/drop60/bn_"
    model_name = "AE-GAN_bn_dp_0.9_0_separate"
    load_checkpoint = False
    outDir = os.path.join("F:/project/simulation_data/drop60", model_name)
    model = "separate"

    x = tf.placeholder(tf.float32, [None, feature_nums], name="input_data")
    # completion = tf.placeholder(tf.float32, [BATCH_SIZE, feature_nums])
    is_training = tf.placeholder(tf.bool, [], name="is_training")
    # completed = tf.placeholder(tf.float32,[None, feature_nums], name="generator_out")
    mask = tf.placeholder(tf.float32, [None, feature_nums], name="mask")

    model = Simple_separate(x,
                            mask,
                            is_training,
                            batch_size=BATCH_SIZE,
                            feature_num=feature_nums,
                            dropout_value=dropout_value,
                            dropout_sign=dropout_sign,
                            is_bn=True)
    sess = tf.Session()
    global_step = tf.Variable(0, name='global_step', trainable=False)
    epoch = tf.Variable(0, name='epoch', trainable=False)

    with tf.name_scope("adam_optimizer"):
        opt = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
        gv_train_op = opt.minimize(model.gv_loss,
                                   global_step=global_step,
                                   var_list=model.g_variables)
        g_train_op = opt.minimize(model.g_loss,
                                  global_step=global_step,
                                  var_list=model.g_variables)
        d_train_op = opt.minimize(model.d_loss,
                                  global_step=global_step,
                                  var_list=model.d_variables)

    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    saver = tf.train.Saver()

    load_model_dir = os.path.join('./backup', model_name)
    if load_checkpoint and os.path.exists('./backup/' + model_name):
        ckpt = tf.train.get_checkpoint_state(load_model_dir)
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
    elif os.path.exists(load_model_dir):
        shutil.rmtree(load_model_dir)
    else:
        os.makedirs(load_model_dir)

    logs_dir = os.path.join("./logs", model_name)
    if load_checkpoint is False and os.path.exists(logs_dir):
        shutil.rmtree(logs_dir)
        os.makedirs(logs_dir)
    writer = tf.summary.FileWriter(logs_dir, sess.graph)

    # if tf.train.get_checkpoint_state('./backup'):
    #     saver = tf.train.Saver()
    #     saver.restore(sess, './backup/latest')
    #
    # logs_dir = os.path.join("./logs", model_name)
    # if os.path.exists(logs_dir) == False:
    #     os.makedirs(logs_dir)
    # writer = tf.summary.FileWriter(logs_dir, sess.graph)

    dataset = DataSet(train_datapath, BATCH_SIZE)
    # each epoch has step_num steps
    step_num = dataset.steps

    while sess.run(epoch) < EPOCH:
        sess.run(tf.assign(epoch, tf.add(epoch, 1)))
        print('epoch: {}'.format(sess.run(epoch)))

        # Completion
        if sess.run(epoch) <= PRETRAIN_EPOCH:
            for i in tqdm.tqdm(range(step_num)):
                x_batch = dataset.next()
                _, gv_loss, gv_summary_str = sess.run(
                    [gv_train_op, model.gv_loss, model.gv_sum],
                    feed_dict={
                        x: x_batch,
                        is_training: True,
                        K.learning_phase(): 1
                    })
                if i % 10 == 0:
                    writer.add_summary(gv_summary_str)

            print('Completion loss: {}'.format(gv_loss))
            if sess.run(epoch) % 100 == 0:
                saver.save(sess,
                           load_model_dir + '/pretrained_g',
                           write_meta_graph=False)
            if sess.run(epoch) == PRETRAIN_EPOCH:
                dataset = DataSet(train_datapath,
                                  BATCH_SIZE,
                                  onepass=True,
                                  shuffle=False)
                imitate_datas = []
                complete_datas = []
                embed_datas = []
                for i in tqdm.tqdm(range(step_num + 1)):
                    x_batch = dataset.next()
                    mask = x_batch == 0
                    embed, imitation, completion = sess.run(
                        [
                            model.encoderv_out, model.imitation,
                            model.completion
                        ],
                        feed_dict={
                            x: x_batch,
                            is_training: False,
                            K.learning_phase(): 0
                        })
                    completion = np.array(completion, dtype=float)
                    imitation = np.array(imitation, dtype=float)
                    embed = np.array(embed, dtype=float)
                    mask = mask.astype(float)
                    completion = x_batch * (1 - mask) + completion * mask
                    imitation = x_batch * (1 - mask) + imitation * mask
                    complete_datas.append(completion)
                    imitate_datas.append(imitation)
                    embed_datas.append(embed)

                dataset = DataSet(train_datapath, BATCH_SIZE)
                complete_datas = np.reshape(
                    np.concatenate(complete_datas, axis=0), (-1, feature_nums))
                imitate_datas = np.reshape(
                    np.concatenate(imitate_datas, axis=0), (-1, feature_nums))
                embed_datas = np.reshape(np.concatenate(embed_datas, axis=0),
                                         (-1, feature_nums // 32))
                df_c = pd.DataFrame(complete_datas)
                df_i = pd.DataFrame(imitate_datas)
                df_e = pd.DataFrame(embed_datas)
                if os.path.exists(outDir) == False:
                    os.makedirs(outDir)
                # outPath = os.path.join(outDir, "infer.complete")
                df_c.to_csv(outDir + "generator.imitate", index=None)
                df_i.to_csv(outDir + "generator.complete", index=None)
                df_e.to_csv(outDir + "generator.embed", index=None)
                print("save complete data to {}".format(outDir +
                                                        "infer.complete"))
                saver.save(sess,
                           load_model_dir + '/pretrained_g',
                           write_meta_graph=False)

        # Discrimitation
        elif sess.run(epoch) <= PRETRAIN_EPOCH_d:
            for i in tqdm.tqdm(range(step_num)):
                x_batch = dataset.next()
                _, d_loss, d_summary_str = sess.run(
                    [d_train_op, model.d_loss, model.d_sum],
                    feed_dict={
                        x: x_batch,
                        is_training: True,
                        K.learning_phase(): 1
                    })
                if i % 10 == 0:
                    writer.add_summary(d_summary_str)

            print('Discriminator loss: {}'.format(d_loss))
            if sess.run(epoch) % 100 == 0:
                saver = tf.train.Saver()
                saver.save(sess,
                           load_model_dir + '/pretrained_d',
                           write_meta_graph=False)

        # together
        elif sess.run(epoch) < EPOCH:
            for i in tqdm.tqdm(range(step_num)):
                x_batch = dataset.next()
                _, d_loss, d_summary_str = sess.run(
                    [d_train_op, model.d_loss, model.d_sum],
                    feed_dict={
                        x: x_batch,
                        is_training: True,
                        K.learning_phase(): 1
                    })
                if i % 10 == 0:
                    writer.add_summary(d_summary_str)

                _, g_loss, g_summary_str = sess.run(
                    [g_train_op, model.g_loss, model.g_sum],
                    feed_dict={
                        x: x_batch,
                        is_training: True,
                        K.learning_phase(): 1
                    })
                if i % 10 == 0:
                    writer.add_summary(g_summary_str)

            print('Completion loss: {}'.format(g_loss))
            print('Discriminator loss: {}'.format(d_loss))
            if sess.run(epoch) % 100 == 0:
                saver = tf.train.Saver()
                saver.save(sess,
                           load_model_dir + '/latest',
                           write_meta_graph=False)

        elif sess.run(epoch) == EPOCH:
            dataset = DataSet(train_datapath,
                              BATCH_SIZE,
                              onepass=True,
                              shuffle=False)
            imitate_datas = []
            complete_datas = []
            embed_datas = []
            for i in tqdm.tqdm(range(step_num + 1)):
                x_batch = dataset.next()
                mask = x_batch == 0
                embed, imitation, completion = sess.run(
                    [model.encoderv_out, model.imitation, model.completion],
                    feed_dict={
                        x: x_batch,
                        is_training: False,
                        K.learning_phase(): 0
                    })
                completion = np.array(completion, dtype=float)
                imitation = np.array(imitation, dtype=float)
                embed = np.array(embed, dtype=float)
                mask = mask.astype(float)
                completion = x_batch * (1 - mask) + completion * mask
                imitation = x_batch * (1 - mask) + imitation * mask
                complete_datas.append(completion)
                imitate_datas.append(imitation)
                embed_datas.append(embed)
            # saver = tf.train.Saver()
            # saver.save(sess, load_model_dir+'/complete', write_meta_graph=False)

            complete_datas = np.reshape(np.concatenate(complete_datas, axis=0),
                                        (-1, feature_nums))
            imitate_datas = np.reshape(np.concatenate(imitate_datas, axis=0),
                                       (-1, feature_nums))
            embed_datas = np.reshape(np.concatenate(embed_datas, axis=0),
                                     (-1, feature_nums // 32))
            df_c = pd.DataFrame(complete_datas)
            df_i = pd.DataFrame(imitate_datas)
            df_e = pd.DataFrame(embed_datas)
            if os.path.exists(outDir) == False:
                os.makedirs(outDir)
            # outPath = os.path.join(outDir, "infer.complete")
            df_c.to_csv(outDir + "infer.imitate", index=None)
            df_i.to_csv(outDir + "infer.complete", index=None)
            df_e.to_csv(outDir + "infer.embed", index=None)
            print("save complete data to {}".format(outDir))
Example #6
                                  "features/overlap." + name + ".npy")
    X_refuting = gen_or_load_feats(refuting_features, h, b,
                                   "features/refuting." + name + ".npy")
    X_polarity = gen_or_load_feats(polarity_features, h, b,
                                   "features/polarity." + name + ".npy")
    X_hand = gen_or_load_feats(hand_features, h, b,
                               "features/hand." + name + ".npy")

    X = np.c_[X_hand, X_polarity, X_refuting, X_overlap]
    return X, y


if __name__ == "__main__":

    #Load the training dataset and generate folds
    d = DataSet()
    folds, hold_out = kfold_split(d, n_folds=10)
    fold_stances, hold_out_stances = get_stances_for_folds(d, folds, hold_out)
    print("length of fold stances :--------> ", len(fold_stances))

    # Load the competition dataset
    competition_dataset = DataSet("competition_test")
    X_competition, y_competition = generate_features(
        competition_dataset.stances, competition_dataset, "competition")

    Xs = dict()
    ys = dict()

    # Load/Precompute all features now
    X_holdout, y_holdout = generate_features(hold_out_stances, d, "holdout")
    for fold in fold_stances:
Example #7
    def train(self, config):

        begin = time.perf_counter()
        dataset = DataSet(config.train_datapath, config.batch_size)
        test_dataset = DataSet(config.train_datapath,
                               config.batch_size,
                               shuffle=False,
                               onepass=True)
        # dataset = dataset
        # test_dataset = dataset
        create_conf = self.create_conf
        steps = dataset.steps * config.epoch
        print("total {} steps...".format(steps))

        # sample_dirs = os.path.join("samples", self.model_name)

        # for dir in [sample_dirs, log_dirs]:
        #   if os.path.exists(dir) == False:
        #     os.makedirs(dir)

        # gpu_conf = tf.ConfigProto()
        # gpu_conf.gpu_options.per_process_gpu_memory_fraction = config.gpu_ratio

        with tf.Session() as session:

            log_dirs = os.path.join("./logs", self.model_name)
            if os.path.exists(log_dirs) == False:
                os.makedirs(log_dirs)

            load_model_dir = os.path.join('./backup', self.model_name)
            if config.load_checkpoint and os.path.exists(load_model_dir):
                self.load(session, load_model_dir)
            elif os.path.exists(load_model_dir):
                shutil.rmtree(load_model_dir)

            if config.load_checkpoint is False and os.path.exists(log_dirs):
                shutil.rmtree(log_dirs)
                os.makedirs(log_dirs)

            self.writer = tf.summary.FileWriter(log_dirs, session.graph)

            tf.global_variables_initializer().run()

            # sample_batch = dataset.sample_batch()
            # sample_mask = np.float32(sample_batch > 0.0)
            # sample_path = os.path.join(sample_dirs, "{}.sample".format(self.model_name))
            # pd.DataFrame(sample_batch).to_csv(sample_path, index=False)

            for step in range(steps + 1):

                batch_data = dataset.next()
                mask = (batch_data == 0.0)
                # mask = np.float32(mask)

                if step % config.save_freq_steps != 0:
                    _, loss, mse_loss, sparse_loss, rank_loss = session.run(
                        [
                            self.apply_grads, self.loss, self.mse_loss,
                            self.sparse_loss, self.rank_loss
                        ],
                        feed_dict={
                            self.X: batch_data,
                            self.mask: mask,
                            self.is_training: True,
                            K.learning_phase(): 1
                        })
                else:
                    _, summary_str, loss, mse_loss, sparse_loss, rank_loss = session.run(
                        [
                            self.apply_grads, self.merged_summary_op,
                            self.loss, self.mse_loss, self.sparse_loss,
                            self.rank_loss
                        ],
                        feed_dict={
                            self.X: batch_data,
                            self.mask: mask,
                            self.is_training: True,
                            K.learning_phase(): 1
                        })

                if step % config.log_freq_steps == 0:
                    print(
                        "step {}th, loss: {}, mse_loss: {}, sparse_loss: {}, rank_loss: {}"
                        .format(step, loss, mse_loss, sparse_loss, rank_loss))

                if step % config.save_freq_steps == 0:
                    self.writer.add_summary(summary_str, step)
                    # save_dir = os.path.join(config.checkpoint_dir, self.model_name)
                    self.save(session, load_model_dir, step)

            self.predict_tmp(session, steps, test_dataset, config)

        end = time.perf_counter()
        print("training {} cost time: {} mins".format(self.model_name,
                                                      (end - begin) / 60.0))
Example #8
 def load_data(self, filepath):
     self.DataSet = DataSet.fromTrainTest(filepath)
     self.train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
         self.DataSet.X_train, label="trk_fake")
     self.test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(
         self.DataSet.X_test, label="trk_fake")
Example #9
 def load_data(self, filepath):
     self.DataSet = DataSet.fromTrainTest(filepath)
     self.dtrain = xgb.DMatrix(self.DataSet.X_train.to_numpy(),
                               label=np.ravel(self.DataSet.y_train))
     self.dtest = xgb.DMatrix(self.DataSet.X_test.to_numpy(),
                              label=np.ravel(self.DataSet.y_test))
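
The DMatrix objects above plug straight into XGBoost's native training API. A minimal sketch with assumed hyperparameters (only dtrain/dtest correspond to the attributes built in load_data):

import xgboost as xgb

params = {"objective": "binary:logistic",    # assumed: trk_fake looks like a binary label
          "eval_metric": "auc",
          "max_depth": 6}
booster = xgb.train(params,
                    dtrain,                  # e.g. self.dtrain from load_data above
                    num_boost_round=100,
                    evals=[(dtest, "test")]) # e.g. self.dtest
preds = booster.predict(dtest)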
Example #10
p = Parameters(NN='CNN',
               period=24,
               n_filters=64,
               n_dense=256,
               n_cl=6,
               batch_size=256,
               n_epochs=50,
               learning_rate=0.0005,
               loss_type='mean_squared_error')

print(datestr)
titolo = str(p.str) + '_date-' + datestr
print(titolo)

train = DataSet('Dataset/ARS_DLR_DataSet_V2.mat',
                seed=1,
                parameters=p,
                reduced_dataset=True)
test = DataSet('Dataset/ARS_DLR_Benchmark_Data_Set.mat',
               seed=1,
               parameters=p,
               reduced_dataset=True,
               labtab=train.lab_tab)

assert (str(train.lab_tab) == str(test.lab_tab))

model = MLModel(train, test, titolo)

model.init(n_filters=p.n_filters,
           n_dense=p.n_dense,
           learning_rate=p.learning_rate,
           loss_type=p.loss_type)
Example #11
def create_dataset(file, attrnames, target, values):
    examples = formatfile(file)
    attrs = [k for k in range(len(examples[0]))]  # index of examples
    inputs = create_input(attrs, target)
    return DataSet(file, examples, inputs, attrs, target, attrnames, values)
Example #12
import os
import datetime
import numpy as np
import tensorflow as tf
from Settings import Config
from Dataset import DataSet
from network import GATQA
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pickle

os.environ['CUDA_VISIBLE_DEVICES'] = '6,7'
FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_boolean('train', True, 'set True to train')
conf_ = Config()
domain = conf_.domain
dataset = DataSet()
traindata = pickle.load(
    open('./data/' + domain + '/' + domain + '-train.pkl', 'rb'))
testdata = pickle.load(
    open('./data/' + domain + '/' + domain + '-test.pkl', 'rb'))


def evaluation(y_pred, y_true):
    f1_s = f1_score(y_true, y_pred, average='macro')
    accuracy_s = accuracy_score(y_true, y_pred)
    return f1_s, accuracy_s


def train(sess, setting):
    with sess.as_default():
        initializer = tf.contrib.layers.xavier_initializer()
Example #13
if __name__ == '__main__':
    args = parse_args()
    num_epochs = args.epochs
    batch_size = args.batch_size
    mf_dim = args.num_factors
    layers = eval(args.layers)
    num_negatives = args.num_neg
    atten_prob = args.atten_prob

    topK = 10
    evaluation_threads = 1  # mp.cpu_count()
    print("NeuMF arguments: %s " % args)

    # Loading data
    t1 = time()
    dataset = DataSet(args.path + args.dataset)
    train, testRatings, testNegatives = dataset.trainMatrix, dataset.testRatings, dataset.testNegatives
    num_users, num_items = train.shape

    print("Load data done [%.1f s]. #user=%d, #item=%d, #train=%d, #test=%d" %
          (time() - t1, num_users, num_items, train.nnz, len(testRatings)))

    # Build model
    model = get_model(num_users, num_items, mf_dim, layers, atten_prob)

    model.compile(optimizer=Adam(), loss='binary_crossentropy')

    # Init performance
    (hits, ndcgs) = evaluate_model(model, testRatings, testNegatives, topK,
                                   evaluation_threads)
    hr, ndcg = np.array(hits).mean(), np.array(ndcgs).mean()
Example #14
    data = scaler.fit_transform(df)
    df = pd.DataFrame(data=data, columns=df.columns)
columns = list(df.columns)
if FLAGS.method is not None:
    columns = select_gene(columns,
                          df,
                          method=FLAGS.method,
                          feature_sel=FLAGS.feature_sel,
                          save=False)
    df = df[columns]
data = df.values
samples, feature_nums = data.shape
train_size = int(np.round(samples * 0.8))
val_size = samples - train_size
df_shuffle = df.sample(frac=1).reset_index(drop=True)
df_train = df_shuffle.iloc[0:train_size]
df_val = df_shuffle.iloc[train_size:]

train_dataset = DataSet(df_train, FLAGS.batch_size)
val_dataset = DataSet(df_val, FLAGS.batch_size, shuffle=False)
test_dataset = DataSet(df, FLAGS.batch_size, shuffle=False)

model = AutoEncoder(feature_nums,
                    num_clusters=FLAGS.num_clusters,
                    beta=FLAGS.beta,
                    dropout=FLAGS.dropout,
                    learning_rate=FLAGS.learning_rate,
                    activation=FLAGS.activation,
                    model_name=FLAGS.model_name)
model.train(train_dataset, val_dataset, test_dataset, columns, FLAGS)
Example #15
def predict(LEARNING_RATE1,
            LEARNING_RATE2,
            PRETRAIN_EPOCH,
            PRETRAIN_EPOCH_d,
            dropout_encoder,
            dropout_decoder,
            EPOCH,
            model_name,
            is_bn,
            is_log,
            is_mask,
            is_binary,
            is_dependent,
            is_after,
            is_before,
            is_bn_d,
            is_approximate,
            model="separate",
            load_checkpoint=False,
            train_datapath=r"F:/project/simulation_data/drop60_p.train",
            feature_nums=15549,
            dropout_sign=1.0):

    BATCH_SIZE = 64
    outDir = os.path.join("F:/project/simulation_data/drop60", model_name)
    model = "separate"

    x = tf.placeholder(tf.float32, [None, feature_nums], name="input_data")
    # completion = tf.placeholder(tf.float32, [BATCH_SIZE, feature_nums])
    is_training = tf.placeholder(tf.bool, [], name="is_training")
    # completed = tf.placeholder(tf.float32,[None, feature_nums], name="generator_out")
    learning_rate = tf.placeholder(tf.float32, shape=[])

    model_dict = {
        "simple": Simple_model,
        "approximate": Softgum_app,
        "separate": Simple_separate
    }

    Network = model_dict[model]
    model = Network(x,
                    is_training,
                    batch_size=BATCH_SIZE,
                    feature_num=feature_nums,
                    dropout_value=dropout_encoder,  # assumed: dropout_encoder stands in for the otherwise undefined dropout_value
                    dropout_sign=dropout_sign,
                    is_bn=is_bn,
                    is_log=is_log,
                    is_mask=is_mask,
                    is_binary=is_binary,
                    is_dependent=is_dependent,
                    is_after=is_after,
                    is_before=is_before,
                    is_approximate=is_approximate)
    sess = tf.Session()
    global_step = tf.Variable(0, name='global_step', trainable=False)
    epoch = tf.Variable(0, name='epoch', trainable=False)

    with tf.name_scope("adam_optimizer"):
        opt = tf.train.AdamOptimizer(learning_rate=learning_rate)
        gv_train_op = opt.minimize(model.gv_loss,
                                   global_step=global_step,
                                   var_list=model.g_variables)
        g_train_op = opt.minimize(model.g_loss,
                                  global_step=global_step,
                                  var_list=model.g_variables)
        d_train_op = opt.minimize(model.d_loss,
                                  global_step=global_step,
                                  var_list=model.d_variables)

    init_op = tf.global_variables_initializer()
    sess.run(init_op)
    saver = tf.train.Saver()

    load_model_dir = os.path.join('./backup', model_name)
    ckpt = tf.train.get_checkpoint_state(load_model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)

    #each epoch has step_num steps

    dataset = DataSet(train_datapath, BATCH_SIZE, onepass=True, shuffle=False)
    step_num = dataset.steps
    imitate_datas = []
    complete_datas = []
    embed_datas = []
    for i in tqdm.tqdm(range(step_num + 1)):
        x_batch = dataset.next()
        mask = x_batch == 0
        embed, imitation, completion = sess.run(
            [model.encoderv_out, model.imitation, model.completion],
            feed_dict={
                x: x_batch,
                is_training: False,
                learning_rate: LEARNING_RATE2,
                K.learning_phase(): 0
            })
        completion = np.array(completion, dtype=float)
        imitation = np.array(imitation, dtype=float)
        embed = np.array(embed, dtype=float)
        mask = mask.astype(float)
        completion = x_batch * (1 - mask) + completion * mask
        imitation = x_batch * (1 - mask) + imitation * mask
        complete_datas.append(completion)
        imitate_datas.append(imitation)
        embed_datas.append(embed)

    complete_datas = np.reshape(np.concatenate(complete_datas, axis=0),
                                (-1, feature_nums))
    imitate_datas = np.reshape(np.concatenate(imitate_datas, axis=0),
                               (-1, feature_nums))
    embed_datas = np.reshape(np.concatenate(embed_datas, axis=0),
                             (-1, feature_nums // 32))
    df_c = pd.DataFrame(complete_datas)
    df_i = pd.DataFrame(imitate_datas)
    df_e = pd.DataFrame(embed_datas)
    if os.path.exists(outDir) == False:
        os.makedirs(outDir)
    # outPath = os.path.join(outDir, "infer.complete")
    df_c.to_csv(outDir + "infer.imitate", index=None)
    df_i.to_csv(outDir + "infer.complete", index=None)
    df_e.to_csv(outDir + "infer.embed", index=None)
    print("save complete data to {}".format(outDir))
Example #16
 def load_data(self, filepath):
     self.DataSet = DataSet.fromTrainTest(filepath)
     print(self.DataSet)