Python Dataset Beispiele, util.dataset.Dataset Python Beispiele

Beispiel #1

0

Datei anzeigen

Datei: custom_model.py Projekt: Sbarbagnem/User_Identify_Inertial_Sensor

    def create_dataset(self, run_colab, colab_path):
        """
        Build the ``Dataset`` for ``self.dataset_name`` and store it on
        ``self.dataset`` (the window length is stored on ``self.winlen``).

        :param run_colab: when truthy, the configured data path is re-rooted
            under ``colab_path``
        :param colab_path: prefix prepended to the data path when
            ``run_colab`` is truthy
        """
        dataset_cfg = self.configuration.config[self.dataset_name]

        # One channel per window axis, plus one extra channel per sensor
        # when the magnitude is used.
        n_channels = dataset_cfg['WINDOW_AXES']
        if self.magnitude:
            n_channels = n_channels + len(
                list(dataset_cfg['SENSOR_DICT'].keys()))

        data_path = dataset_cfg['PATH_OUTER_PARTITION']
        if run_colab:
            # Everything before the first '.' is dropped and the remaining
            # pieces are glued together without the dots.
            pieces_after_first_dot = data_path.split('.')[1:]
            data_path = colab_path + ''.join(pieces_after_first_dot)

        # An outer directory whose name contains '128' forces a window of
        # 128 samples; otherwise the configured window size is used.
        if '128' in self.outer_dir:
            window_samples = 128
        else:
            window_samples = dataset_cfg['WINDOW_SAMPLES']
        self.winlen = window_samples

        self.dataset = Dataset(
            path=data_path,
            channel=n_channels,
            winlen=window_samples,
            user_num=dataset_cfg['NUM_CLASSES_USER'],
            act_num=dataset_cfg['NUM_CLASSES_ACTIVITY'],
            outer_dir=self.outer_dir)

Beispiel #2

0

Datei anzeigen

Datei: corpus.py Projekt: jq2276/Learning2Copy

 def transform(self, data_file, batch_size,
               data_type="test", shuffle=False, device=None):
     """
     Read ``data_file``, turn it into examples and return a data loader.

     The raw data is read with ``self.read_data``, converted with
     ``self.build_examples``, wrapped in a ``Dataset`` and batched via
     ``Dataset.create_batches(batch_size, shuffle, device)``.
     """
     examples = self.build_examples(
         self.read_data(data_file, data_type=data_type))
     return Dataset(examples).create_batches(batch_size, shuffle, device)

Beispiel #3

0

Datei anzeigen

Datei: corpus.py Projekt: jq2276/Learning2Copy

 def load_data(self, prepared_data_file=None):
     """
     Load the prepared data file with ``torch.load`` and store its
     "train", "valid" and "test" parts on ``self.data`` as ``Dataset``
     objects.

     When ``prepared_data_file`` is falsy, ``self.prepared_data_file`` is
     used instead.
     """
     if not prepared_data_file:
         prepared_data_file = self.prepared_data_file
     print("Loading prepared data from {} ...".format(prepared_data_file))

     loaded = torch.load(prepared_data_file)
     self.data = {split: Dataset(loaded[split])
                  for split in ("train", "valid", "test")}

     # One "NAME-size" entry per split, e.g. "TRAIN-<n>".
     sizes = ["{}-{}".format(split.upper(), len(split_data))
              for split, split_data in self.data.items()]
     print("Number of examples:", " ".join(sizes))

Beispiel #4

0

Datei anzeigen

def train(args):
    """
    Train the reading comprehension model.

    Creates the missing model/result/summary directories for
    ``args.data_type``, checks that every data file path contains
    ``args.data_type``, loads the pickled vocabulary, builds the dataset
    from the train and dev files and runs
    ``MultiAnsModel.train_and_evaluate_several_batchly``.

    Raises ``ValueError`` when a train/dev/test file path does not contain
    ``args.data_type``.
    """
    logger = logging.getLogger("brc")

    logger.info('check the directories...')
    output_dirs = [
        os.path.join(root, args.data_type)
        for root in (args.model_dir, args.result_dir, args.summary_dir)
    ]
    for dir_path in output_dirs:
        if os.path.exists(dir_path):
            continue
        logger.warning(
            "don't exist {} directory, so we create it!".format(dir_path))
        os.makedirs(dir_path)

    # data_type and the data files easily get out of sync, so check here
    data_files = args.train_files + args.dev_files + args.test_files
    if any(args.data_type not in data_file for data_file in data_files):
        raise ValueError('Inconsistency between data_type and files')

    logger.info('Load data_set and vocab...')
    vocab_path = os.path.join(args.vocab_dir, args.data_type, args.vocab_file)
    with open(vocab_path, 'rb') as vocab_stream:
        logger.info('load vocab from {}'.format(vocab_path))
        vocab = pickle.load(vocab_stream)

    dataset = Dataset(
        args.max_p_num,
        args.max_p_len,
        args.max_q_len,
        args.max_a_len,
        train_answer_len_cut_bins=args.train_answer_len_cut_bins,
        train_files=args.train_files,
        dev_files=args.dev_files,
        badcase_sample_log_file=args.badcase_sample_log_file)

    logger.info('Converting text into ids...')
    dataset.convert_to_ids(vocab, args.use_oov2unk)

    logger.info('Initialize the model...')
    model = MultiAnsModel(vocab, args)

    logger.info('Training the model...')
    # Checkpoints go to <model_dir>/<data_type>, prefixed with desc + algo.
    checkpoint_dir = os.path.join(args.model_dir, args.data_type)
    checkpoint_prefix = args.desc + args.algo
    model.train_and_evaluate_several_batchly(
        data=dataset,
        epochs=args.epochs,
        batch_size=args.batch_size,
        evaluate_cnt_in_one_epoch=args.evaluate_cnt_in_one_epoch,
        save_dir=checkpoint_dir,
        save_prefix=checkpoint_prefix)
    logger.info('Done with model training!')

Beispiel #5

0

Datei anzeigen

def load_dataset(splits=('train', 'dev', 'test'), domains='all', strict=False,
                 base_path=None, elmo=False):
    """
    Load the dataset splits together with ontology, vocabulary and a
    word-to-vector lookup from JSON files in one directory.

    :param splits: names of the splits to load; each one is read from
        ``<split>.json`` in the data directory
    :param domains: filter for domains (if 'all', use all available);
        not implemented yet, the value is ignored
    :param strict: if True, select only dialogs that contain only a single
        domain; not implemented yet, the value is ignored
    :param base_path: directory holding the JSON files; when falsy the
        module-level ``dann`` path is used
    :param elmo: not used by this function
    :return: tuple ``(dataset, ontology, vocab, w2v)`` where ``dataset``
        maps each split name to its ``Dataset`` and ``w2v`` maps each word
        of the vocabulary to its row in ``emb.json``
    """
    path = base_path if base_path else dann
    # TODO implement filtering with `domains` and `strict`
    with open(os.path.join(path, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))
    with open(os.path.join(path, 'vocab.json')) as f:
        vocab = Vocab.from_dict(json.load(f))
    with open(os.path.join(path, 'emb.json')) as f:
        E = json.load(f)

    # Row i of the embedding list belongs to the i-th word of the vocab.
    w2v = {w: E[i] for i, w in enumerate(vocab.to_dict()['index2word'])}

    dataset = {}
    for split in splits:
        with open(os.path.join(path, '{}.json'.format(split))) as f:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and emits the same record.
            logging.warning('loading split {}'.format(split))
            dataset[split] = Dataset.from_dict(json.load(f))

    logging.info('dataset sizes: {}'.format(pformat({k: len(v) for k, v in dataset.items()})))
    return dataset, ontology, vocab, w2v

Beispiel #6

0

Datei anzeigen

    def collect_demo(self):
        """
        Convert every frame in ``self.df_list`` into a list of transitions
        and collect those lists in a new ``Dataset``.

        For a frame with ``n`` rows, ``n - 1`` transitions are built: row
        ``i`` supplies the observation, row ``i + 1`` supplies the action
        and the next observation, and the reward is always 1.0. The state
        of each frame's last row is appended to ``self.goal_states``.

        :return: the ``Dataset`` holding one transition list per frame
        """
        def rounded(frame, column, row):
            # Value of `column` at `row`, rounded to self.around_digit.
            return np.around(frame[column][row], self.around_digit)

        def state_at(frame, row):
            # State index for the rounded (f1, f2) position of `row`.
            return self._pos2state(rounded(frame, 'f1', row),
                                   rounded(frame, 'f2', row))

        demos = Dataset(self.max_length)
        for frame in self.df_list:
            last_row = frame.shape[0] - 1
            # The last row never starts a transition (original author's
            # open question here: "before termination?").
            trajectory = [
                transition(
                    obs=state_at(frame, row),
                    act=self.inv_action_idx[(rounded(frame, 'a1', row + 1),
                                             rounded(frame, 'a2', row + 1))],
                    next_obs=state_at(frame, row + 1),
                    rew=1.0)
                for row in range(last_row)
            ]
            demos.append(trajectory)

            self.goal_states.append(state_at(frame, last_row))

        return demos

Beispiel #7

0

Datei anzeigen

Datei: corpus.py Projekt: jq2276/Learning2Copy

    def reload(self, data_type='test'):
        """
        Re-read ``<data_dir>/<data_prefix>.<data_type>``, rebuild its
        examples and replace ``self.data[data_type]`` with a new
        ``Dataset``; then print the size of every split in ``self.data``.
        """
        source_file = os.path.join(self.data_dir,
                                   self.data_prefix + "." + data_type)
        # NOTE(review): read_data is always called with data_type="test",
        # whatever `data_type` was passed in — confirm this is intended.
        examples = self.build_examples(
            self.read_data(source_file, data_type="test"))
        self.data[data_type] = Dataset(examples)

        sizes = ["{}-{}".format(split.upper(), len(split_data))
                 for split, split_data in self.data.items()]
        print("Number of examples:", " ".join(sizes))

Beispiel #8

0

Datei anzeigen

Datei: commons.py Projekt: Sanyam07/XAI-Analytics

def get_dataset(id: str) -> (Dataset, str):
    """
    Load one of the built-in datasets and log a short summary of it.

    :param id: The id (must be equal to the Datasets enum name) of the dataset
    :return: The dataset returned by ``Dataset.built_in``, and the success
        message for the user
    """
    loaded = Dataset.built_in(id)

    template = "Dataset \'{} ({})\' loaded successfully. For further information about this dataset please visit: {}"
    msg = template.format(loaded.id.name, loaded.name, loaded.url)

    log.info(msg)
    # Preview of the first rows of the dataset's data frame.
    preview = loaded.df.head()
    log.info("\n{}".format(preview))

    return loaded, msg

Beispiel #9

0

Datei anzeigen

Datei: commons.py Projekt: Sanyam07/XAI-Analytics

def get_dataset(name: str, url: str) -> (Dataset, str):
    """
    Load a dataset from an external URL and log a short summary of it.

    :param name: The name of the dataset.
    :param url: The URL from which the dataset should be (down-)loaded
    :return: The dataset returned by ``Dataset.from_url``, and the success
        message for the user
    """
    loaded = Dataset.from_url(name, url)

    template = "Dataset \'{} ({})\' loaded successfully. For further information about this dataset please visit: {}"
    msg = template.format(loaded.id.name, loaded.name, loaded.url)

    log.info(msg)
    # Preview of the first rows of the dataset's data frame.
    preview = loaded.df.head()
    log.info("\n{}".format(preview))

    return loaded, msg

Beispiel #10

0

Datei anzeigen

def evaluate(args):
    """
    Evaluate the trained model on the dev files.

    Loads the pickled vocabulary, builds the dev dataset, restores the
    model stored under ``<model_dir>/<data_type>`` with the prefix
    ``desc + algo``, runs the evaluation and logs the loss and the result.
    Predictions are written under ``<result_dir>/<data_type>`` with the
    prefix ``dev.predicted``.

    :param args: parsed options object; this function reads ``vocab_dir``,
        ``data_type``, ``vocab_file``, ``train_files``, ``dev_files``,
        ``test_files``, ``max_p_num``, ``max_p_len``, ``max_q_len``,
        ``badcase_sample_log_file``, ``use_oov2unk``, ``model_dir``,
        ``result_dir``, ``desc``, ``algo`` and ``batch_size``
    :raises AssertionError: if ``args.dev_files`` is empty
    :raises ValueError: if a train/dev/test file path does not contain
        ``args.data_type``
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    vocab_path = os.path.join(args.vocab_dir, args.data_type, args.vocab_file)
    with open(vocab_path, 'rb') as fin:
        logger.info('load vocab from {}'.format(vocab_path))
        # NOTE: unpickling runs code embedded in the file; only load vocab
        # files that were produced by this project.
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'

    # data_type and the data files easily get out of sync, so check here
    for f in args.train_files + args.dev_files + args.test_files:
        if args.data_type not in f:
            raise ValueError('Inconsistency between data_type and files')

    brc_data = Dataset(args.max_p_num,
                       args.max_p_len,
                       args.max_q_len,
                       dev_files=args.dev_files,
                       badcase_sample_log_file=args.badcase_sample_log_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab, args.use_oov2unk)
    logger.info('Build the model...')
    rc_model = MultiAnsModel(vocab, args)

    model_dir = os.path.join(args.model_dir, args.data_type)
    model_prefix = args.desc + args.algo
    logger.info('restore model from {}, with prefix {}'.format(
        model_dir, model_prefix))
    rc_model.restore(model_dir=model_dir, model_prefix=model_prefix)

    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev',
                                            args.batch_size,
                                            pad_id=vocab.get_id(
                                                vocab.pad_token),
                                            shuffle=False)
    # Number of batches = ceil(dev size / batch size).
    dev_size = brc_data.get_data_length('dev')
    total_batch_count = dev_size // args.batch_size + int(
        dev_size % args.batch_size != 0)

    result_dir = os.path.join(args.result_dir, args.data_type)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(total_batch_count,
                                                 dev_batches,
                                                 result_dir=result_dir,
                                                 result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    # Report the directory that was actually handed to the model; the
    # previous message named only the parent result_dir.
    logger.info('Predicted answers are saved to {}'.format(result_dir))

Beispiel #11

0

Datei anzeigen

def generate_dataset_elmo(elmo, splits=('train', 'dev', 'test'), domains='all', strict=False,
                          base_path=None):
    """
    Load the ontology and the requested dataset splits from JSON files and
    run ``to_elmo(elmo)`` on every split.

    :param elmo: object handed to ``Dataset.to_elmo`` for each split
    :param splits: names of the splits to load; each one is read from
        ``<split>.json`` in the data directory
    :param domains: not used by this function
    :param strict: not used by this function
    :param base_path: directory holding the JSON files; when falsy the
        files are looked up relative to the current working directory
    :return: tuple ``(dataset, ontology)`` where ``dataset`` maps each
        split name to its processed ``Dataset``
    """
    path = base_path if base_path else ''
    with open(os.path.join(path, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))

    dataset = {}
    for split in splits:
        with open(os.path.join(path, '{}.json'.format(split))) as f:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and emits the same record.
            logging.warning('loading split {}'.format(split))
            data = Dataset.from_dict(json.load(f))
            # Debug aid (disabled): data.dialogues = data.dialogues[:500]
            data.to_elmo(elmo)
            dataset[split] = data

    logging.info('dataset sizes: {}'.format(pformat({k: len(v) for k, v in dataset.items()})))
    return dataset, ontology

Beispiel #12

0

Datei anzeigen

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from models.linear_regression import LinearRegressionModel
from util.dataset import Dataset

# Number of cluster centers requested from make_blobs.
num_centers = 2
# NOTE(review): not referenced in the visible part of this script —
# presumably a plotting resolution; confirm against the rest of the file.
lineres = 100

# 30 random two-feature samples (X) with their cluster labels (y).
X, y = make_blobs(n_samples=30, n_features=2, centers=num_centers, center_box=(-10.0, 10.0))


print(X)
print(y)
dataset = Dataset(X, y)
# presumably get_feature(i) returns the values of feature i — TODO confirm
# in util.dataset
xdata = dataset.get_feature(0)
ydata = dataset.get_feature(1)
# Value range of both features, widened by 3 on each side.
xmin = np.min(xdata) - 3
xmax = np.max(xdata) + 3
ymin = np.min(ydata) - 3
ymax = np.max(ydata) + 3

regression_model = LinearRegressionModel(2)
regression_model.fit(dataset)

# End points of the fitted line: the model's predictions at xmin and xmax.
xline = [xmin, xmax]
yline = [regression_model.predict([xmin]), regression_model.predict([xmax])]



plt.figure()

Beispiel #13

0

Datei anzeigen

Datei: self_play.py Projekt: ssi379/irelia

# Script setup for self-play: flags, output directory, game environment,
# TensorFlow session, model and dataset writer.
# presumably set_flags() defines/parses FLAGS and make_dirs() creates the
# directory if needed — confirm in the `common` module
common.set_flags()
common.make_dirs(os.path.join(FLAGS.save_dir, "dataset_ready"))

# Game environment; apart from "use_check" (fixed to False) every option is
# taken from the corresponding FLAGS value.
env = Game.make("KoreanChess-v1",
                {"use_check": False, "limit_step": FLAGS.max_step, "print_mcts_history": FLAGS.print_mcts_history,
                 "use_color_print": FLAGS.use_color_print, "use_cache": FLAGS.use_cache})

# TensorFlow 1.x session; allow_growth makes GPU memory get allocated on
# demand instead of being reserved up front.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
model = Model(sess, weight_decay=FLAGS.weight_decay, momentum=FLAGS.momentum, num_layers=FLAGS.num_model_layers,
              use_cache=FLAGS.use_cache, conf=FLAGS)
# Initialise all graph variables before creating the checkpoint Saver.
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

ds = Dataset(sess)

while True:
    common.restore_model(FLAGS.save_dir, "best_model.ckpt", saver, sess)
    now = common.now_date_str_nums()
    dataset_path = os.path.join(FLAGS.save_dir, ("dataset_%s_%s.csv" % (now, uuid.uuid4())))
    ds.open(dataset_path)
    game_results = {"b": 0, "r": 0, "d": 0}
    episode = 0
    while True:
        """"""
        """self-play"""
        log("self-play episode %d" % episode)
        info, state_history, mcts_history = play.self_play(env, model, FLAGS.max_simulation, FLAGS.max_step,
                                                           FLAGS.c_puct, FLAGS.exploration_step, FLAGS.reuse_mcts,
                                                           FLAGS.print_mcts_tree, FLAGS.num_state_history,

Beispiel #14

0

Datei anzeigen

Datei: exp_emb_mine1.py Projekt: code-7308/hybrid-kge

# Experiment setup: read the parameters, recreate the workspace directories,
# build the dataset and dump the parameters next to the results.
p = Parameters()
params = p.get_parameters()

workspace = Workspace(params.w, params.em, params.exp_id)

# Start from a clean workspace: an existing base directory is deleted
# together with everything in it.
if os.path.exists(workspace.base):
    shutil.rmtree(workspace.base)

os.makedirs(workspace.base)
os.makedirs(workspace.result_dir)

# fresh start generate train data from raw for mining and embedding
dataset = Dataset(
    workspace,
    train_ratio=params.tr_ratio,
    shuffle=True,
    load_existing_test_files=params.load_existing_test_files,
    load_existing_test_files_sparsity=params.load_existing_test_files_sparsity)

global_iters = params.g_iters

print(str(params))
# The context manager closes the file even if write() raises; the names
# `params_dump_file` and `n` stay bound exactly as before.
with open(workspace.base + "/params.txt", "w") as params_dump_file:
    n = params_dump_file.write(str(params))

for iter_id in range(global_iters):
    print("Global Iter: ", iter_id)

    # run embedding model
    if iter_id == 0:

Beispiel #15

0

Datei anzeigen

Datei: self_play_and_train.py Projekt: ssi379/irelia

# Session, summary writer and model for combined self-play and training.
# `config` is defined above the visible part of this script.
sess = tf.Session(config=config)
writer = tf.summary.FileWriter(FLAGS.save_dir + '/summary', sess.graph)
model = Model(sess,
              weight_decay=FLAGS.weight_decay,
              momentum=FLAGS.momentum,
              num_layers=FLAGS.num_model_layers,
              use_cache=FLAGS.use_cache,
              conf=FLAGS)
# Initialise all graph variables before creating the checkpoint Saver.
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
learning_rate = FLAGS.learning_rate

# presumably loads the checkpoint FLAGS.model_file_name from save_dir into
# the session — confirm in the `common` module
common.restore_model(FLAGS.save_dir, FLAGS.model_file_name, saver, sess)

dataset_path = os.path.join(FLAGS.save_dir, "dataset.csv")
ds = Dataset(sess)
ds.open(dataset_path)
# Per-winner game counters; the keys are the values info["winner"] can take
# (presumably "b"/"r" for the two sides and "d" for a draw — TODO confirm).
game_results = {"b": 0, "r": 0, "d": 0}
wins = 0
for episode in range(FLAGS.max_episode):
    """"""
    """self-play"""
    print("self-play episode %d" % episode)
    info, state_history, mcts_history = play.self_play(
        env, model, FLAGS.max_simulation, FLAGS.max_step, FLAGS.c_puct,
        FLAGS.exploration_step, FLAGS.reuse_mcts, FLAGS.print_mcts_tree,
        FLAGS.num_state_history, FLAGS.print_mcts_search)

    # Only episodes that report a (truthy) winner are counted.
    if info["winner"]:
        game_results[info["winner"]] += 1
        wins += 1

Beispiel #16

0

Datei anzeigen

Datei: train_nn.py Projekt: yikedouer/NJU_KBQA

    # print "loading train data..."
    # x_u, x_r, y, _ = util.load_data(config.train_file, True, config.neg_sample)
    # train_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict)
    # print np.array(train_dataset.ques_idx).shape, np.array(train_dataset.rela_idx).shape, np.array(
    #     train_dataset.label).shape
    #
    # print "loading dev data..."
    # x_u, x_r, y, _ = util.load_data(config.dev_file, True, config.neg_sample)
    # dev_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict)
    # print np.array(dev_dataset.ques_idx).shape, np.array(dev_dataset.rela_idx).shape, np.array(dev_dataset.label).shape

    print("loading test data...")
    x_u, x_r, y, _ = util.load_data(config.test_file, False,
                                    config.num_classes)
    print(np.array(x_u).shape, np.array(x_r).shape, np.array(y).shape)
    print(x_u[0], x_r[0], y[0])
    test_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict)
    print(
        np.array(test_dataset.ques_idx).shape,
        np.array(test_dataset.rela_idx).shape,
        np.array(test_dataset.label).shape)

    # print "training..."
    # train_nn(train_dataset, dev_dataset, config.max_sent_len, embedding)

    print("testing...")
    test_nn(test_dataset, config.max_sent_len, embedding)

    end = time.time()
    print('total time: %s' % str(end - start))

Beispiel #17

0

Datei anzeigen

        print("time {}, test loss {:g}, train acc {:g}".format(
            end - start, test_loss / test_set.size,
            test_correct_num / test_set.size))


# Script entry point: load the word embeddings, then build the train and
# dev datasets and print their array shapes.
if __name__ == "__main__":
    # Start of the wall-clock measurement for the run.
    start = time.time()

    print("loading word embedding...")
    word_dict, embedding = util.get_pretrained_word_vector(
        config.word2vec_file, (config.voc_size, config.emb_size))
    print("vocabulary size: %d" % len(word_dict))

    print("loading train data...")
    # The fourth return value of load_data is not needed here.
    x_u, x_r, y, _ = util.load_data(config.train_file, True, config.neg_sample)
    train_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict)
    print(
        np.array(train_dataset.ques_idx).shape,
        np.array(train_dataset.rela_idx).shape,
        np.array(train_dataset.label).shape)
    print("train dataset length:")
    print(train_dataset.ques_lens, train_dataset.rela_lens)

    print("loading dev data...")
    # x_u, x_r and y are reused for the dev split.
    x_u, x_r, y, _ = util.load_data(config.dev_file, True, config.neg_sample)
    dev_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict)
    print(
        np.array(dev_dataset.ques_idx).shape,
        np.array(dev_dataset.rela_idx).shape,
        np.array(dev_dataset.label).shape)

Beispiel #18

0

Datei anzeigen

def _build_and_save_vocab(args, logger):
    """
    Build the vocabulary from the train files, attach embeddings to it and
    pickle it to ``<vocab_dir>/<data_type>/<vocab_file>``.
    """
    logger.info('load train dataset...')
    train_data = Dataset(
        args.max_p_num,
        args.max_p_len,
        args.max_q_len,
        args.max_a_len,
        train_answer_len_cut_bins=args.train_answer_len_cut_bins,
        train_files=args.train_files,
        badcase_sample_log_file=args.badcase_sample_log_file)

    logger.info('Building vocabulary...')
    vocab = Vocab(
        init_random=args.initial_tokens_random,
        trainable_oov_cnt_threshold=args.trainable_oov_cnt_threshold)
    for word in train_data.word_iter('train'):
        vocab.add(word)

    # Remember the size before filtering so the number of dropped tokens
    # can be reported.
    size_before_filter = vocab.size()
    vocab.filter_tokens_by_cnt(min_cnt=args.vocab_min_cnt)
    filtered_num = size_before_filter - vocab.size()
    logger.info(
        'After filter {} tokens, the final vocab size is {}'.format(
            filtered_num, vocab.size()))

    logger.info('Assigning embeddings...')
    if args.pretrained_word_path is None:
        logger.info('random init word embeddings...')
        vocab.randomly_init_embeddings(args.embed_size)
    else:
        logger.info('load the pretrained word embeddings...')
        vocab.build_embedding_matrix(args.pretrained_word_path)

    logger.info('Saving vocab...')
    vocab_path = os.path.join(args.vocab_dir, args.data_type,
                              args.vocab_file)
    with open(vocab_path, 'wb') as vocab_stream:
        pickle.dump(vocab, vocab_stream)


def prepare(args):
    """
    Check the data files, create the output directories and, when
    ``args.create_vocab`` is truthy, build and save the vocabulary.

    Raises ``AssertionError`` when a data file does not exist and
    ``ValueError`` when a data file path does not contain
    ``args.data_type``.
    """
    logger = logging.getLogger()

    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(
            data_path)

    logger.info('Preparing the directories...')
    output_dirs = [
        os.path.join(root, args.data_type)
        for root in (args.vocab_dir, args.model_dir, args.result_dir,
                     args.summary_dir)
    ]
    for dir_path in output_dirs:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)

    # data_type and the data files easily get out of sync, so check here
    data_files = args.train_files + args.dev_files + args.test_files
    if any(args.data_type not in data_file for data_file in data_files):
        raise ValueError('Inconsistency between data_type and files')

    if args.create_vocab:
        _build_and_save_vocab(args, logger)

    logger.info('Done with preparing!')

Beispiel #19

0

Datei anzeigen

    len(labeled_data), len(unlabeled_data), len(dev_data)))

# Tokenizing
# Every split below is tokenized with truncation and padding enabled and
# then wrapped, together with its labels, in a Dataset.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Element 0 of each item is used as the text, element 1 as the label.
labeled_texts = [data[0] for data in labeled_data]
labeled_labels = [data[1] for data in labeled_data]

# Augmentation only runs when the flag is exactly True; the texts/labels
# returned by back_translate are appended to the labeled data.
if args.do_augment is True:
    augmented_texts, augmented_labels = back_translate(labeled_texts,
                                                       labeled_labels)
    labeled_texts.extend(augmented_texts)
    labeled_labels.extend(augmented_labels)

labeled_encodings = tokenizer(labeled_texts, truncation=True, padding=True)
labeled_dataset = Dataset(labeled_encodings, labeled_labels)

dev_texts = [data[0] for data in dev_data]
dev_labels = [data[1] for data in dev_data]
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
dev_dataset = Dataset(dev_encodings, dev_labels)

# test_texts and test_labels are defined above the visible part of this
# script.
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = Dataset(test_encodings, test_labels)

# We keep the label of unlabeled data to track for accuracy of pseudo-labeling
unlabeled_texts = [data[0] for data in unlabeled_data]
unlabeled_labels = [data[1] for data in unlabeled_data]
unlabeled_encodings = tokenizer(unlabeled_texts, truncation=True, padding=True)
unlabeled_dataset = Dataset(unlabeled_encodings, unlabeled_labels)

Beispiel #20

0

Datei anzeigen

Datei: preprocess_data.py Projekt: coastalcph/dialog-rl

    if missing_files(draw, splits):
        if not os.path.isdir(draw):
            os.makedirs(draw)

    if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
        if not os.path.isdir(dann):
            os.makedirs(dann)
        dataset = {}
        ontology = Ontology()
        vocab = Vocab()
        vocab.word2index(['<sos>', '<eos>'], train=True)
        for s in splits:
            fname = '{}.json'.format(s)
            logging.warn('Annotating {}'.format(s))
            dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
            dataset[s].numericalize_(vocab)
            ontology = ontology + dataset[s].extract_ontology()
            with open(os.path.join(dann, fname), 'wt') as f:
                json.dump(dataset[s].to_dict(), f)
        ontology.numericalize_(vocab)
        with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
            json.dump(ontology.to_dict(), f)
        with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
            json.dump(vocab.to_dict(), f)

        logging.warn('Computing word embeddings')
        embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
        E = []
        for w in tqdm(vocab._index2word):
            e = []

Beispiel #21

0

Datei anzeigen

 def self_train(self, labeled_dataset, unlabeled_dataset, guide_type=None, confidence_threshold=0.9):
     """
     Self-training loop: pseudo-label sampled unlabeled data and retrain.

     Every outer epoch shuffles ``unlabeled_dataset`` in place, takes its
     first half, pseudo-labels that sample, merges the result with
     ``labeled_dataset`` and retrains a freshly created model on the merged
     data for up to ``self.config.epochs`` inner epochs (early stopping on
     the dev loss, patience 5). Within an outer epoch the model is saved
     whenever the dev accuracy improves.

     :param labeled_dataset: labeled data handed to ``self.add_dataset``
     :param unlabeled_dataset: mutable sequence whose items are read as
         ``item[0]`` (text) and ``item[1]`` (label); shuffled in place
     :param guide_type: forwarded to ``self.pseudo_labeling``
     :param confidence_threshold: forwarded to ``self.pseudo_labeling``
     :return: None; the best test accuracy seen is only printed
     """
     best_accuracy = -1

     print(len(unlabeled_dataset))
     print(type(unlabeled_dataset))

     for outer_epoch in range(self.config.epochs):
         # Sample half of the unlabeled pool (the caller's list is
         # reordered by the shuffle).
         sampled_num = len(unlabeled_dataset) // 2
         random.shuffle(unlabeled_dataset)
         sampled_unlabeled = unlabeled_dataset[:sampled_num]

         sampled_text = [data[0] for data in sampled_unlabeled]
         sampled_labels = [data[1] for data in sampled_unlabeled]
         sampled_encodings = self.tokenizer(sampled_text, truncation=True, padding=True)
         sampled_unlabeled_dataset = Dataset(sampled_encodings, sampled_labels)

         print('outer_epoch {} sampled unlabeled dataset {}'.format(outer_epoch, len(sampled_unlabeled_dataset)))

         # pseudo-labeling
         new_dataset = self.pseudo_labeling(sampled_unlabeled_dataset, confidence_threshold, guide_type)

         # add pseudo-label into labeled data
         combined_dataset, new_dataset = self.add_dataset(labeled_dataset, new_dataset)

         # NOTE: pseudo-labeled items are not removed from
         # unlabeled_dataset (the remove_dataset step is disabled), so they
         # can be sampled again in a later outer epoch.

         self.train_loader = DataLoader(combined_dataset, **self.config.train_params)
         self.early_stopping = EarlyStopping(patience=5, verbose=True)

         # re-initialize the student model from scratch
         del self.model, self.optimizer
         if self.model_type =='baseline':
             self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=self.config.class_num).to(self.config.device)
         else:
             self.model = BERT_ATTN(num_labels=self.config.class_num).to(self.config.device)
         # Both model types use the same optimizer settings.
         self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2e-5)

         # retrain model with labeled data + pseudo-labeled data
         best_dev_acc = -1
         for inner_epoch in range(self.config.epochs):
             print('outer_epoch {} inner_epoch {} best_accuracy {}'.format(outer_epoch, inner_epoch, best_accuracy))
             self.train_epoch(inner_epoch)
             dev_loss, dev_acc = self.evaluator.evaluate(self.model, self.valid_loader)
             self.early_stopping(dev_loss)

             # save model when current dev_acc is greater than best_dev_acc
             if dev_acc > best_dev_acc:
                 best_dev_acc = dev_acc
                 if self.model_type =='baseline':
                     self.model.save_pretrained(self.ssl_path)
                 else:
                     # Keep the lexicon that belongs to the saved weights.
                     self.lexicon = copy.deepcopy(self.lexicon_temp)
                     torch.save({'model_state_dict':self.model.state_dict(),
                                 'optimizer_state_dict':self.optimizer.state_dict(),
                                 'epoch': {'outer_epoch':outer_epoch, 'inner_epoch':inner_epoch}},
                                self.ssl_path +'/checkpoint.pt')

             # The test set is evaluated after every inner epoch; the best
             # test accuracy is tracked across all outer epochs.
             _, test_acc = self.evaluator.evaluate(self.model, self.test_loader, is_test=True)
             if best_accuracy < test_acc:
                 best_accuracy = test_acc

             # Start the next inner epoch with an empty temporary lexicon.
             if self.model_type != 'baseline':
                 self.lexicon_temp = {label:{} for label in range(self.config.class_num)}

             if self.early_stopping.early_stop:
                 print("Early Stopping!")
                 break

     print('Best accuracy {}'.format(best_accuracy))

Beispiel #22

0

Datei anzeigen

 def encode_dataset(self, texts, labels):
     """
     Tokenize ``texts`` with ``self.tokenizer`` (truncation and padding
     enabled) and return them, together with ``labels``, as a ``Dataset``.
     """
     return Dataset(
         self.tokenizer(texts, truncation=True, padding=True),
         labels)

Beispiel #23

0

Datei anzeigen

Datei: custom_model.py Projekt: Sbarbagnem/User_Identify_Inertial_Sensor

class Model():
    def __init__(self,
                 dataset_name,
                 configuration_file,
                 multi_task,
                 lr,
                 model_type,
                 fold_test,
                 save_dir='log',
                 outer_dir='OuterPartition/',
                 overlap=5.0,
                 magnitude=False,
                 init_lr=0.001,
                 drop_factor=0.5,
                 drop_epoch=10,
                 path_best_model='',
                 log=False):
        """Store the run configuration and open the TensorBoard writers.

        Args:
            dataset_name: key into ``configuration_file.config``.
            configuration_file: object exposing ``EPOCHS``, ``BATCH_SIZE``
                and a ``config`` mapping with per-dataset settings.
            multi_task: whether the model predicts activity and user.
            lr: learning-rate policy label (also part of the log path).
            model_type: name of the architecture to build.
            fold_test: test fold identifier.
            save_dir: root directory for the summary logs.
            outer_dir: name of the outer-partition directory.
            overlap: window overlap, forwarded when loading data.
            magnitude: if true, one extra axis per sensor is counted.
            init_lr: initial learning rate for the optimizer.
            drop_factor: multiplicative learning-rate decay factor.
            drop_epoch: epoch period of the scheduled decay.
            path_best_model: accepted for compatibility; not used here.
            log: whether to print per-epoch metrics.
        """
        dataset_cfg = configuration_file.config[dataset_name]

        self.dataset_name = dataset_name
        self.configuration = configuration_file
        self.multi_task = multi_task
        self.lr = lr
        self.init_lr = init_lr
        self.drop_factor = drop_factor
        self.drop_epoch = drop_epoch
        self.overlap = overlap
        self.model_type = model_type
        self.epochs = configuration_file.EPOCHS
        self.num_act = dataset_cfg['NUM_CLASSES_ACTIVITY']
        self.num_user = dataset_cfg['NUM_CLASSES_USER']
        self.batch_size = configuration_file.BATCH_SIZE
        self.sensor_dict = dataset_cfg['SENSOR_DICT']
        self.fold_test = fold_test
        self.log = log
        self.magnitude = magnitude
        self.outer_dir = outer_dir

        # With magnitude enabled every sensor contributes one extra axis.
        self.axes = dataset_cfg['WINDOW_AXES']
        if magnitude:
            self.axes = self.axes + len(list(dataset_cfg['SENSOR_DICT'].keys()))

        # to see performance on user identification based on activity done
        self.history_act_true = []
        self.history_act_pred = []
        self.history_user_true = []
        self.history_user_pred = []

        # Take the timestamp once: two separate now() calls could straddle a
        # second boundary and put train/val logs under different run folders.
        run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        run_dir = "{}/{}/{}/{}/batch_{}/lr_{}/over_{}/fold_{}/{}".format(
            save_dir, self.model_type, self.dataset_name,
            'multi_task' if self.multi_task else 'single_task',
            self.batch_size, self.lr, str(overlap), self.fold_test, run_stamp)
        self.train_log_dir = run_dir + "/train"
        self.val_log_dir = run_dir + "/val"
        self.train_writer = tf.summary.create_file_writer(self.train_log_dir)
        self.val_writer = tf.summary.create_file_writer(self.val_log_dir)

        # Per-activity counters of right / wrong user predictions.
        self.final_pred_right_act = [0] * self.num_act
        self.final_pred_wrong_act = [0] * self.num_act
        self.best_model = None

    def create_dataset(self, run_colab, colab_path):
        """Instantiate ``self.dataset`` from the per-dataset configuration.

        Args:
            run_colab: when true, the configured path is re-rooted under
                ``colab_path``.
            colab_path: prefix used for the data path when ``run_colab``.
        """
        dataset_cfg = self.configuration.config[self.dataset_name]

        # One extra channel per sensor when the magnitude is included.
        channel = dataset_cfg['WINDOW_AXES']
        if self.magnitude:
            channel = channel + len(list(dataset_cfg['SENSOR_DICT'].keys()))

        path = dataset_cfg['PATH_OUTER_PARTITION']
        if run_colab:
            # Drop everything up to the first '.' and join the remaining
            # pieces, then prepend the drive path.
            pieces = path.split('.')[1:]
            path = colab_path + ''.join(pieces)

        # A '128' in the partition directory name forces 128-sample windows.
        if '128' in self.outer_dir:
            winlen = 128
        else:
            winlen = dataset_cfg['WINDOW_SAMPLES']
        self.winlen = winlen

        self.dataset = Dataset(path=path,
                               channel=channel,
                               winlen=winlen,
                               user_num=dataset_cfg['NUM_CLASSES_USER'],
                               act_num=dataset_cfg['NUM_CLASSES_ACTIVITY'],
                               outer_dir=self.outer_dir)

    def load_data(self, only_acc=False, only_acc_gyro=False, realdisp=False):
        """Load the train/validation/test splits and store them on ``self``.

        Args:
            only_acc: keep only the first 3 channels (4 with magnitude).
            only_acc_gyro: keep only the first 6 channels (8 with magnitude).
            realdisp: the loader additionally returns one extra array per
                split (stored as ``*_di``) and user labels are remapped to
                a contiguous range.

        Raises:
            ValueError: if ``only_acc`` and ``only_acc_gyro`` are both set;
                the two selections cannot be combined.
        """
        if only_acc and only_acc_gyro:
            # Previously this surfaced later as an IndexError, because the
            # second selection indexed channels the first one had removed.
            raise ValueError(
                'only_acc and only_acc_gyro are mutually exclusive')

        # get data [examples, window_samples, axes, channel]
        loaded = self.dataset.load_data(fold_test=self.fold_test,
                                        overlapping=self.overlap,
                                        realdisp=realdisp)
        if realdisp:
            (TrainData, TrainLA, TrainLU, TrainDI,
             ValidData, ValidLA, ValidLU, ValidDI,
             TestData, TestLA, TestLU, TestDI) = loaded
        else:
            (TrainData, TrainLA, TrainLU,
             ValidData, ValidLA, ValidLU,
             TestData, TestLA, TestLU) = loaded

        self.dataset_name_plot = self.dataset_name + f'_magnitude_{str(self.magnitude).lower()}' + f'_overlap_{self.overlap}'
        self.num_user = len(np.unique(TrainLU))

        # For realdisp there is no data for subjects 6 and 13, so the user
        # labels are remapped onto a contiguous 0..n-1 range.
        if realdisp:
            old_user_label = np.unique(TrainLU)
            mapping_user_label = dict(
                zip(old_user_label, np.arange(len(old_user_label))))
            # Keep the labels as numpy arrays: the weighted dataset builders
            # index them with ``self.train_user[idx]``, which a list rejects.
            TrainLU = np.array([mapping_user_label[user] for user in TrainLU])
            ValidLU = np.array([mapping_user_label[user] for user in ValidLU])
            TestLU = np.array([mapping_user_label[user] for user in TestLU])

        # Optionally restrict the data to the leading channels.
        keep = None
        if only_acc:
            # accelerometer only
            keep = 4 if self.magnitude else 3
            suffix = 'only_acc'
        elif only_acc_gyro:
            # accelerometer and gyroscope only
            keep = 8 if self.magnitude else 6
            suffix = 'only_acc_gyro'

        if keep is not None:
            columns = list(range(keep))
            TrainData = TrainData[:, :, columns]
            ValidData = ValidData[:, :, columns]
            TestData = TestData[:, :, columns]
            self.axes = keep
            self.dataset_name_plot = self.dataset_name_plot + suffix

        self.dataset._channel = self.axes

        self.train = TrainData
        self.train_user = TrainLU
        self.train_act = TrainLA
        self.val = ValidData
        self.val_user = ValidLU
        self.val_act = ValidLA
        self.test = TestData
        self.test_user = TestLU
        self.test_act = TestLA
        if realdisp:
            self.train_di = TrainDI
            self.val_di = ValidDI
            self.test_di = TestDI

    def normalize_data(self):
        """Replace the three stored splits with their normalized versions.

        Normalization itself is delegated to ``self.dataset.normalize_data``.
        """
        normalized = self.dataset.normalize_data(self.train, self.val,
                                                 self.test)
        self.train, self.val, self.test = normalized

    def tf_dataset(self, method, weighted):
        """Build the ``tf.data`` pipelines for train, validation and test.

        Args:
            method: grouping used for weighted sampling: ``'act'``,
                ``'subject'`` or ``'act_subject'``. Ignored when
                ``weighted == 'no'``.
            weighted: ``'no'`` for a plain shuffled training set; any other
                value is forwarded to the selected builder as its weighting
                mode.

        Raises:
            ValueError: if weighted sampling is requested with an unknown
                ``method``.
        """
        if weighted == 'no':
            self.create_tensorflow_dataset()
        else:
            builders = {
                'act': self.create_dataset_for_act,
                'subject': self.create_dataset_for_subject,
                'act_subject': self.create_dataset_for_act_subject,
            }
            if method not in builders:
                # Previously this fell through and failed later with an
                # UnboundLocalError on ``datasets``.
                raise ValueError(
                    f"unknown sampling method {method!r}; expected one of "
                    f"{sorted(builders)}")
            datasets, weights = builders[method](weighted)

            # Empty groups yield an infinite inverse-frequency weight; they
            # must never be sampled, so their weight becomes 0.
            weights = np.where(weights == float('Inf'), 0, weights)
            dataset_weighted = tf.data.experimental.sample_from_datasets(
                datasets, weights)
            dataset_weighted = dataset_weighted.shuffle(
                buffer_size=self.train.shape[0], reshuffle_each_iteration=True)
            dataset_weighted = dataset_weighted.batch(self.batch_size,
                                                      drop_remainder=True)
            self.train_data = dataset_weighted

        # Validation and test are each served as one single batch.
        ValData = tf.data.Dataset.from_tensor_slices(self.val)
        ValLA = tf.data.Dataset.from_tensor_slices(self.val_act)
        ValLU = tf.data.Dataset.from_tensor_slices(self.val_user)
        val_data = tf.data.Dataset.zip((ValData, ValLA, ValLU))
        self.val_data = val_data.batch(len(ValData))

        TestData = tf.data.Dataset.from_tensor_slices(self.test)
        TestLA = tf.data.Dataset.from_tensor_slices(self.test_act)
        TestLU = tf.data.Dataset.from_tensor_slices(self.test_user)
        test_data = tf.data.Dataset.zip((TestData, TestLA, TestLU))
        self.test_data = test_data.batch(len(TestData))

    def create_tensorflow_dataset(self):
        """Build the unweighted training pipeline and store it in ``self.train_data``.

        Elements are (window, activity label, user label) triples, shuffled
        with a buffer as large as the training set and batched with the
        incomplete last batch dropped.
        """
        components = tuple(
            tf.data.Dataset.from_tensor_slices(tensor)
            for tensor in (self.train, self.train_act, self.train_user))
        pipeline = tf.data.Dataset.zip(components)
        pipeline = pipeline.shuffle(buffer_size=self.train.shape[0],
                                    reshuffle_each_iteration=True)
        self.train_data = pipeline.batch(self.batch_size, drop_remainder=True)

    def create_dataset_for_act_subject(self, method='balance'):
        """Split the training set into one dataset per (user, activity) pair.

        Args:
            method: ``'balance'`` weights every group by the inverse of its
                size; ``'train_set'`` weights it by its share of the
                training set.

        Returns:
            ``(datasets, weights)``: the per-group ``tf.data`` datasets in
            user-major order and a numpy array with one weight per group.
            With ``'balance'`` an empty group gets an infinite weight.

        Raises:
            ValueError: if ``method`` is not one of the two supported modes.
        """
        if method not in ('balance', 'train_set'):
            # Previously ``weights`` stayed unbound and the return failed.
            raise ValueError(
                f"unknown weighting method {method!r}; "
                "expected 'balance' or 'train_set'")

        # The activity index sets do not depend on the user: compute once.
        act_indices = [
            np.where(self.train_act == act)
            for act in np.unique(self.train_act)
        ]

        datasets = []
        act_user_sample_count = []
        for user in np.unique(self.train_user):
            idx_user = np.where(self.train_user == user)
            for idx_act in act_indices:
                idx = np.intersect1d(idx_user, idx_act)
                dataset = tf.data.Dataset.from_tensor_slices(
                    (self.train[idx], self.train_act[idx],
                     self.train_user[idx]))
                datasets.append(dataset)
                act_user_sample_count.append(len(idx))

        counts = np.asarray(act_user_sample_count, dtype=float)
        if method == 'balance':
            # A user may never have performed some activity; the resulting
            # 1/0 is an expected ``inf`` here, so silence the warning only.
            with np.errstate(divide='ignore'):
                weights = 1.0 / counts
        else:
            weights = counts / np.sum(counts)

        return datasets, weights

    def create_dataset_for_subject(self, method='balance'):
        """Split the training set into one dataset per user.

        Args:
            method: ``'balance'`` weights every user by the inverse of the
                number of their samples; ``'train_set'`` weights them by
                their share of the training set.

        Returns:
            ``(datasets, weights)``: the per-user ``tf.data`` datasets and a
            numpy array with one weight per user, in ``np.unique`` order.

        Raises:
            ValueError: if ``method`` is not one of the two supported modes.
        """
        if method not in ('balance', 'train_set'):
            # Previously ``weights`` stayed unbound and the return failed.
            raise ValueError(
                f"unknown weighting method {method!r}; "
                "expected 'balance' or 'train_set'")

        datasets = []
        user_sample_count = []
        for user in np.unique(self.train_user):
            idx = np.where(self.train_user == user)
            dataset = tf.data.Dataset.from_tensor_slices(
                (self.train[idx], self.train_act[idx], self.train_user[idx]))
            datasets.append(dataset)
            # Reuse the index already computed instead of a second np.where.
            user_sample_count.append(idx[0].shape[0])

        counts = np.asarray(user_sample_count, dtype=float)
        if method == 'balance':
            weights = 1.0 / counts
        else:
            weights = counts / np.sum(counts)

        return datasets, weights

    def create_dataset_for_act(self, method='balance'):
        """Split the training set into one dataset per activity.

        Args:
            method: ``'balance'`` weights every activity by its inverse
                frequency, so batches are balanced across activities;
                ``'train_set'`` weights it by its share of the training
                set, so batches follow the training distribution.

        Returns:
            ``(datasets, weights)``: the per-activity ``tf.data`` datasets
            and a numpy array with one weight per activity, in
            ``np.unique`` order.

        Raises:
            ValueError: if ``method`` is not one of the two supported modes.
        """
        if method not in ('balance', 'train_set'):
            # Previously ``weights`` stayed unbound and the return failed.
            raise ValueError(
                f"unknown weighting method {method!r}; "
                "expected 'balance' or 'train_set'")

        datasets = []
        activities_sample_count = []
        for act in np.unique(self.train_act):
            idx = np.where(self.train_act == act)
            dataset = tf.data.Dataset.from_tensor_slices(
                (self.train[idx], self.train_act[idx], self.train_user[idx]))
            datasets.append(dataset)
            # Reuse the index already computed instead of a second np.where.
            activities_sample_count.append(idx[0].shape[0])

        counts = np.asarray(activities_sample_count, dtype=float)
        if method == 'balance':
            weights = 1.0 / counts
        else:
            weights = counts / np.sum(counts)

        return datasets, weights

    def augment_data(self,
                     function_to_apply=None,
                     augmented_par=None,
                     ratio_random_transformations=1,
                     compose=False,
                     only_compose=False,
                     plot_augmented=False,
                     n_func_to_apply=3):
        """Augment the training split in place via ``self.dataset.augment_data``.

        Args:
            function_to_apply: list forwarded to the dataset's augmenter;
                defaults to a fresh empty list.
            augmented_par: list forwarded to the dataset's augmenter;
                defaults to a fresh empty list.
            ratio_random_transformations, compose, only_compose,
            plot_augmented, n_func_to_apply: forwarded unchanged.

        The stored training data and labels are replaced by the augmented
        ones and ``self.dataset_name_plot`` gets an ``_augmented`` suffix.
        """
        # None sentinels instead of ``[]`` defaults: a default list object is
        # shared by every call and would leak any mutation done downstream.
        if function_to_apply is None:
            function_to_apply = []
        if augmented_par is None:
            augmented_par = []

        shape_original = self.train.shape[0]

        # Axes per sensor: 3, plus one when the magnitude is included.
        # NOTE(review): true division, so this is a float; confirm what
        # ``Dataset.augment_data`` expects before switching to ``//``.
        axes_per_sensor = 4 if self.magnitude else 3
        n_sensor = self.train.shape[2] / axes_per_sensor

        train_augmented, label_user_augmented, label_act_augmented = self.dataset.augment_data(
            self.train, self.train_user, self.train_act, self.magnitude,
            augmented_par, function_to_apply, compose, only_compose,
            plot_augmented, ratio_random_transformations, n_func_to_apply,
            n_sensor)

        self.train = train_augmented
        self.train_user = label_user_augmented
        self.train_act = label_act_augmented

        print('data before augmented {}, data after augmented {}'.format(
            shape_original, train_augmented.shape[0]))

        self.dataset_name_plot = self.dataset_name_plot + '_augmented'
        if self.winlen != 100:
            self.dataset_name_plot = self.dataset_name_plot + '_w_128'

    def build_model(self, stride=1, fc=False):
        """Instantiate the network named by ``self.model_type`` and build it.

        Args:
            stride: forwarded to the ``resnet18_2D`` constructor only.
            fc: forwarded to the ``resnet18_2D`` constructor only.

        The model is built for inputs of shape
        ``(None, self.winlen, self.axes, 1)``. An unrecognised model type
        creates no model here.
        """
        print('using model: ', self.model_type)

        kind = self.model_type
        if kind == 'resnet18_2D':
            self.model = resnet2D(self.multi_task,
                                  self.num_act,
                                  self.num_user,
                                  stride=stride,
                                  fc=fc)
        elif kind == 'resnet18_multi_branch':
            self.model = resnet18MultiBranch(self.sensor_dict, self.num_user,
                                             self.magnitude)
        elif kind == 'resnet18_lstm_parallel':
            self.model = parallel(self.multi_task, self.num_act, self.num_user)
        elif kind == 'resnet18_lstm_consecutive':
            self.model = consecutive(self.multi_task, self.num_act,
                                     self.num_user)
        elif kind == 'resnet18_1D':
            self.model = resnet1D(self.multi_task, self.num_act, self.num_user)
        elif kind == 'resnet18_2D_multitask':
            self.model = resne18MultiTask(self.num_act, self.num_user)

        input_shape = (None, self.winlen, self.axes, 1)
        self.model.build(input_shape=input_shape)

    def print_model_summary(self):
        """Call ``summary()`` on the current model."""
        network = self.model
        network.summary()

    def loss_opt_metric(self):
        """Create the losses, the Adam optimizer and the train/validation metrics.

        The optimizer starts from ``self.init_lr``. Loss and accuracy
        trackers exist for both the activity and the user head; precision
        and recall trackers exist for the user head only.
        """
        losses = tf.keras.losses
        metrics = tf.keras.metrics

        # losses and optimizer
        self.loss_act = losses.SparseCategoricalCrossentropy()
        self.loss_user = losses.SparseCategoricalCrossentropy()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.init_lr)

        # training-side trackers
        self.train_loss_activity = metrics.Mean(name='train_loss_activity')
        self.train_loss_user = metrics.Mean(name='train_loss_user')
        self.train_accuracy_activity = metrics.SparseCategoricalAccuracy(
            name='train_accuracy_activity')
        self.train_accuracy_user = metrics.SparseCategoricalAccuracy(
            name='train_accuracy_user')
        self.train_precision_user = metrics.Precision()
        self.train_recall_user = metrics.Recall()

        # validation-side trackers
        self.valid_loss_activity = metrics.Mean(name='valid_loss_activity')
        self.valid_loss_user = metrics.Mean(name='valid_loss_user')
        self.valid_accuracy_activity = metrics.SparseCategoricalAccuracy(
            name='valid_accuracy_activity')
        self.valid_accuracy_user = metrics.SparseCategoricalAccuracy(
            name='valid_accuracy_user')
        self.val_precision_user = metrics.Precision()
        self.val_recall_user = metrics.Recall()

    @tf.function
    def train_step(self, batch, label_activity, label_user, num_user):
        """Run one optimisation step and return the batch confusion matrix.

        Args:
            batch: model input for this step.
            label_activity: activity labels; only used when multi-task.
            label_user: user labels.
            num_user: number of classes of the confusion matrix.

        Returns:
            Confusion matrix of the user predictions on this batch.
        """
        with tf.GradientTape() as tape:
            if self.multi_task:
                # Two heads: the activity loss is added to the user loss.
                predictions_act, predictions_user = self.model(batch,
                                                               training=True)
                loss_a = self.loss_act(y_true=label_activity,
                                       y_pred=predictions_act)
            else:
                predictions_user = self.model(batch, training=True)

            loss_u = self.loss_user(y_true=label_user,
                                    y_pred=predictions_user)
            if self.multi_task:
                loss_global = loss_a + loss_u
            else:
                loss_global = loss_u

        trainable = self.model.trainable_variables
        gradients = tape.gradient(loss_global, trainable)
        self.optimizer.apply_gradients(
            grads_and_vars=zip(gradients, self.model.trainable_variables))

        # running metrics
        if self.multi_task:
            self.train_loss_activity.update_state(values=loss_a)
            self.train_accuracy_activity.update_state(y_true=label_activity,
                                                      y_pred=predictions_act)
        self.train_loss_user.update_state(values=loss_u)
        self.train_accuracy_user.update_state(y_true=label_user,
                                              y_pred=predictions_user)

        # confusion matrix on batch
        predicted_user = tf.math.argmax(predictions_user, axis=1)
        return tf.math.confusion_matrix(label_user,
                                        predicted_user,
                                        num_classes=num_user)

    @tf.function
    def valid_step(self, batch, label_activity, label_user, num_user):
        """Evaluate one batch and update the validation metrics.

        ``self.best_model`` is used when it is set, otherwise ``self.model``.

        Returns:
            ``(cm, predicted_user)``: the confusion matrix of the user
            predictions and the arg-max user class of every sample.
        """
        if self.best_model is not None:
            network = self.best_model
        else:
            network = self.model

        if self.multi_task:
            predictions_act, predictions_user = network(batch, training=False)
            loss_a = self.loss_act(y_true=label_activity,
                                   y_pred=predictions_act)
        else:
            predictions_user = network(batch, training=False)

        loss_u = self.loss_user(y_true=label_user, y_pred=predictions_user)

        if self.multi_task:
            self.valid_loss_activity.update_state(values=loss_a)
            self.valid_accuracy_activity.update_state(y_true=label_activity,
                                                      y_pred=predictions_act)

        self.valid_loss_user.update_state(values=loss_u)
        self.valid_accuracy_user.update_state(y_true=label_user,
                                              y_pred=predictions_user)

        # confusion matrix from which precision, recall and f1 are derived
        cm = tf.math.confusion_matrix(label_user,
                                      tf.math.argmax(predictions_user, axis=1),
                                      num_classes=num_user)
        predicted_user = tf.math.argmax(predictions_user, axis=1)

        return cm, predicted_user

    def distribution_act_on_batch(self, label_act):
        """Pretty-print how many samples of each activity ``label_act`` holds."""
        distribution = {}
        for act in np.unique(label_act):
            distribution[act] = np.count_nonzero(label_act == act)
        pprint.pprint(distribution)

    def train_model(self, epochs):
        """Set the number of epochs and run the matching training loop.

        The multi-task loop is used for the ``resnet18_2D_multitask`` model
        type or whenever ``self.multi_task`` is set; otherwise the
        single-task loop runs.
        """
        self.epochs = epochs
        if self.model_type == 'resnet18_2D_multitask' or self.multi_task:
            self.train_multi_task()
            return
        self.train_single_task()

    def train_single_task(self):
        """Train the user-identification model and keep the best weights.

        Every epoch runs one pass over ``self.train_data`` and one over
        ``self.val_data``, logging loss, accuracy and macro
        precision/recall/F1 to the summary writers. The epoch with the
        lowest validation user loss is remembered. After five epochs
        without improvement the learning rate is multiplied by
        ``self.drop_factor``; training stops after six such epochs (once
        past epoch 20) or when the learning rate falls below 1e-6.

        On return ``self.model`` holds the weights of the best epoch and
        ``self.best_model`` refers to it (``None`` if no epoch improved).
        """
        # Best epoch seen so far. The weights are snapshotted with
        # get_weights(): storing ``self.model`` itself only aliases the live
        # network, which keeps training and ends up as the last-epoch model.
        best_seen = {
            'epoch': 0,
            'loss': float('inf'),
            'weights': None,
            'time_not_improved': 0
        }

        for epoch in range(1, self.epochs + 1):

            ### PERFORMANCE ON TRAIN AFTER EACH EPOCH ###

            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)
            for batch, _, label_user in self.train_data:
                cm_batch = self.train_step(batch, None, label_user,
                                           self.num_user)
                cm = cm + cm_batch
            metrics = custom_metrics(cm)
            if self.log:
                print(
                    "TRAIN: epoch: {}/{}, loss_user: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
                    .format(epoch, self.epochs,
                            self.train_loss_user.result().numpy(),
                            self.train_accuracy_user.result().numpy(),
                            metrics['macro_precision'],
                            metrics['macro_recall'], metrics['macro_f1']))

            with self.train_writer.as_default():
                for tag, value in (
                        ('loss_user', self.train_loss_user.result()),
                        ('accuracy_user', self.train_accuracy_user.result()),
                        ('macro_precision_user', metrics['macro_precision']),
                        ('macro_recall_user', metrics['macro_recall']),
                        ('macro_f1_user', metrics['macro_f1'])):
                    tf.summary.scalar(tag, value, step=epoch)
            self.train_loss_user.reset_states()
            self.train_accuracy_user.reset_states()

            ### PERFORMANCE ON VALIDATION AFTER EACH EPOCH ###

            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)
            temp_predictions_user = []
            temp_label_user = []
            temp_label_act = []

            for batch, label_act, label_user in self.val_data:
                cm_batch, predictions_user = self.valid_step(
                    batch, label_act, label_user, self.num_user)
                cm = cm + cm_batch
                temp_predictions_user.extend(predictions_user.numpy())
                temp_label_user.extend(label_user.numpy())
                temp_label_act.extend(label_act.numpy())
            metrics = custom_metrics(cm)

            if self.log:
                print(
                    "VALIDATION: epoch: {}/{}, loss_user: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
                    .format(epoch, self.epochs,
                            self.valid_loss_user.result().numpy(),
                            self.valid_accuracy_user.result().numpy(),
                            metrics['macro_precision'],
                            metrics['macro_recall'], metrics['macro_f1']))

            with self.val_writer.as_default():
                for tag, value in (
                        ('loss_user', self.valid_loss_user.result()),
                        ('accuracy_user', self.valid_accuracy_user.result()),
                        ('macro_precision_user', metrics['macro_precision']),
                        ('macro_recall_user', metrics['macro_recall']),
                        ('macro_f1_user', metrics['macro_f1'])):
                    tf.summary.scalar(tag, value, step=epoch)

            # update best seen model based on the validation loss
            valid_loss = self.valid_loss_user.result().numpy()
            stop_training = False
            if valid_loss < best_seen['loss']:
                best_seen['loss'] = valid_loss
                best_seen['epoch'] = epoch
                best_seen['weights'] = self.model.get_weights()
                best_seen['time_not_improved'] = 0
                # Per-activity counters always describe the best epoch only.
                self.final_pred_right_act = [0] * self.num_act
                self.final_pred_wrong_act = [0] * self.num_act
                self.update_pred_based_on_act(temp_predictions_user,
                                              temp_label_user, temp_label_act)
            else:
                best_seen['time_not_improved'] += 1
                if best_seen['time_not_improved'] >= 6 and epoch > 20:
                    print('early stop')
                    stop_training = True
                elif best_seen['time_not_improved'] == 5:
                    new_lr = self.decay_lr_on_plateau()
                    if new_lr < 0.000001:
                        print('min lr reached')
                        stop_training = True
                    else:
                        self.optimizer.learning_rate.assign(new_lr)
                        print(f'reduce learning rate on plateau to {new_lr}')

            # reset loss and accuracy after each epoch (also when stopping)
            self.valid_loss_user.reset_states()
            self.valid_accuracy_user.reset_states()

            if stop_training:
                break

        # Restore the best weights so that the model used afterwards really
        # is the best one seen, not the one from the final epoch.
        if best_seen['weights'] is not None:
            self.model.set_weights(best_seen['weights'])
            self.best_model = self.model
        else:
            self.best_model = None

    def train_multi_task(self):
        """Train the activity + user model for ``self.epochs`` epochs.

        Each epoch trains on ``self.train_data`` and then evaluates on
        ``self.test_data``; both passes are logged to the summary writers.
        On the last epoch the per-activity prediction counters are updated.
        When ``self.lr == 'dynamic'`` the learning rate follows
        ``decay_lr``. Nothing is trained unless ``self.multi_task`` is set.
        """
        train_line = (
            "TRAIN: epoch: {}/{}, loss_act: {:.5f}, loss_user: {:.5f}, "
            "acc_act: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
        )
        valid_line = (
            "VALIDATION: epoch: {}/{}, loss_act: {:.5f}, loss_user: {:.5f}, "
            "acc_act: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
        )

        for epoch in range(1, self.epochs + 1):
            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)
            if not self.multi_task:
                continue

            # ---- training pass ----
            for batch, label_act, label_user in self.train_data:
                cm = cm + self.train_step(batch, label_act, label_user,
                                          self.num_user)
            metrics = custom_metrics(cm)
            if self.log:
                print(
                    train_line.format(
                        epoch, self.epochs,
                        self.train_loss_activity.result().numpy(),
                        self.train_loss_user.result().numpy(),
                        self.train_accuracy_activity.result().numpy(),
                        self.train_accuracy_user.result().numpy(),
                        metrics['macro_precision'],
                        metrics['macro_recall'], metrics['macro_f1']))
            with self.train_writer.as_default():
                for tag, value in (
                        ('loss_activity', self.train_loss_activity.result()),
                        ('accuracy_activity',
                         self.train_accuracy_activity.result()),
                        ('loss_user', self.train_loss_user.result()),
                        ('accuracy_user', self.train_accuracy_user.result()),
                        ('macro_precision', metrics['macro_precision']),
                        ('macro_recall', metrics['macro_recall']),
                        ('macro_f1', metrics['macro_f1'])):
                    tf.summary.scalar(tag, value, step=epoch)
            self.train_loss_activity.reset_states()
            self.train_loss_user.reset_states()
            self.train_accuracy_activity.reset_states()
            self.train_accuracy_user.reset_states()

            # ---- evaluation pass (runs on the test split) ----
            cm = tf.zeros(shape=(self.num_user, self.num_user),
                          dtype=tf.int32)
            last_epoch = epoch == self.epochs
            for batch, label_act, label_user in self.test_data:
                cm_batch, predictions_user = self.valid_step(
                    batch, label_act, label_user, self.num_user)
                cm = cm + cm_batch
                if last_epoch:
                    self.update_pred_based_on_act(predictions_user,
                                                  label_user, label_act)
            metrics = custom_metrics(cm)
            with self.val_writer.as_default():
                for tag, value in (
                        ('loss_activity', self.valid_loss_activity.result()),
                        ('accuracy_activity',
                         self.valid_accuracy_activity.result()),
                        ('loss_user', self.valid_loss_user.result()),
                        ('accuracy_user', self.valid_accuracy_user.result()),
                        ('macro_precision', metrics['macro_precision']),
                        ('macro_recall', metrics['macro_recall']),
                        ('macro_f1', metrics['macro_f1'])):
                    tf.summary.scalar(tag, value, step=epoch)
            if self.log:
                print(
                    valid_line.format(
                        epoch, self.epochs,
                        self.valid_loss_activity.result().numpy(),
                        self.valid_loss_user.result().numpy(),
                        self.valid_accuracy_activity.result().numpy(),
                        self.valid_accuracy_user.result().numpy(),
                        metrics['macro_precision'],
                        metrics['macro_recall'], metrics['macro_f1']))
            self.valid_loss_activity.reset_states()
            self.valid_loss_user.reset_states()
            self.valid_accuracy_activity.reset_states()
            self.valid_accuracy_user.reset_states()

            # ---- scheduled learning-rate decay ----
            if self.lr == 'dynamic':
                new_lr = self.decay_lr(self.init_lr,
                                       self.drop_factor,
                                       self.drop_epoch,
                                       epoch=epoch)
                self.optimizer.learning_rate.assign(new_lr)
                with self.train_writer.as_default():
                    tf.summary.scalar("learning_rate", new_lr, step=epoch)

    def decay_lr(self, init_lr, drop_factor, drops_epoch, epoch):
        """Return the step-decayed learning rate for a given epoch.

        The rate is ``init_lr * drop_factor ** floor((epoch + 1) / drops_epoch)``.

        Args:
            init_lr: Learning rate before any drop is applied.
            drop_factor: Multiplier applied once per completed drop interval.
            drops_epoch: Length, in epochs, of one drop interval.
            epoch: Epoch index the rate is computed for.

        Returns:
            The decayed learning rate as a plain Python float.
        """
        # Number of whole drop intervals reached once this epoch is counted.
        completed_drops = np.floor((epoch + 1) / drops_epoch)
        return float(init_lr * drop_factor**completed_drops)

    def decay_lr_on_plateau(self):
        """Return the optimizer's current learning rate scaled by ``drop_factor``.

        The optimizer is not modified here; the caller decides whether to
        assign the returned value.
        """
        return self.optimizer.learning_rate * self.drop_factor

    def test_model(self, log=False):
        """Run the model over ``self.test_data`` and report user-level metrics.

        Every batch is passed through ``self.valid_step``; the per-batch
        confusion matrices are summed and handed to ``custom_metrics``. The
        per-activity right/wrong counters are reset at the start and refilled
        from the collected predictions.

        The loss and accuracy that are printed and returned are read from
        ``self.valid_loss_user`` / ``self.valid_accuracy_user``; this method
        does not reset those accumulators itself.

        Args:
            log: When true, additionally show the user confusion matrix as a
                heatmap.

        Returns:
            Tuple ``(user accuracy, macro F1)``.
        """
        # Start from clean per-activity counters for the error-by-activity plot.
        self.final_pred_right_act = [0 for _ in range(self.num_act)]
        self.final_pred_wrong_act = [0 for _ in range(self.num_act)]

        all_predictions = []
        all_user_labels = []
        all_act_labels = []
        confusion = tf.zeros(shape=(self.num_user, self.num_user),
                             dtype=tf.int32)

        for batch, label_act, label_user in self.test_data:
            batch_confusion, predictions_user = self.valid_step(
                batch, label_act, label_user, self.num_user)
            confusion = confusion + batch_confusion
            all_predictions.extend(predictions_user.numpy())
            all_user_labels.extend(label_user.numpy())
            all_act_labels.extend(label_act.numpy())

        metrics = custom_metrics(confusion)

        self.update_pred_based_on_act(all_predictions, all_user_labels,
                                      all_act_labels)

        loss_user = self.valid_loss_user.result().numpy()
        accuracy_user = self.valid_accuracy_user.result().numpy()
        macro_f1 = metrics['macro_f1']

        print(
            "\nTEST FINAL: loss_user: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
            .format(loss_user, accuracy_user, metrics['macro_precision'],
                    metrics['macro_recall'], macro_f1))

        if log:
            # Confusion matrix, one row/column per user id.
            user_ids = [str(user) for user in range(self.num_user)]
            df_cm = pd.DataFrame(confusion.numpy(),
                                 index=user_ids,
                                 columns=user_ids)
            plt.figure(figsize=(30, 21))
            sn.heatmap(df_cm, annot=True)
            plt.show()

        return accuracy_user, macro_f1

    def plot_distribution_data(self, val_test=True):
        """Plot how samples are distributed over users and activities.

        A first figure holds a 2-row grid of bar charts: the top row shows
        samples per user, the bottom row samples per activity, with one
        column per plotted split. Afterwards one heatmap per split shows the
        number of samples for every (user, activity) pair.

        Args:
            val_test: When false only the train split is plotted. When true
                the validation split is plotted as well, and the test split
                too if ``self.test`` is not ``None``.
        """

        def count_per_class(labels, num_classes):
            # Samples carrying each label id in [0, num_classes).
            flat = np.asarray(labels).ravel()
            return [
                int(np.count_nonzero(flat == label))
                for label in range(num_classes)
            ]

        def count_user_activity(users, acts):
            # distribution[user][act] = samples of that user doing that act.
            flat_users = np.asarray(users).ravel()
            flat_acts = np.asarray(acts).ravel()
            # Pair labels position by position and stop at the shorter one.
            common = min(len(flat_users), len(flat_acts))
            flat_users = flat_users[:common]
            flat_acts = flat_acts[:common]
            return [[
                int(np.count_nonzero((flat_users == user) &
                                     (flat_acts == act)))
                for act in range(self.num_act)
            ] for user in range(self.num_user)]

        def draw_bars(position, title, counts):
            # One bar per class id, classes numbered from 1 on the x axis.
            plt.subplot(row, col, position)
            plt.title(title)
            plt.bar(x=list(range(1, len(counts) + 1)), height=counts)

        def draw_heatmap(split_name, users, acts):
            # Users on the x axis, activities on the y axis.
            distribution = count_user_activity(users, acts)
            plt.figure()
            plt.title('Distribution act for user in {} set'.format(split_name))
            plt.xlabel('User id')
            plt.ylabel('Act id')
            sn.heatmap(np.transpose(distribution),
                       linewidths=0.3,
                       cmap='YlGnBu',
                       annot=True,
                       fmt="d")
            plt.show()

        # The test split is only drawn when it exists. Drawing it
        # unconditionally would address subplot indices outside the 2x2 grid
        # used when there is no test split.
        has_test = val_test and self.test is not None

        splits = [('Train', self.train_user, self.train_act)]
        if val_test:
            splits.append(('Val', self.val_user, self.val_act))
        if has_test:
            splits.append(('Test', self.test_user, self.test_act))

        row = 2
        col = len(splits)

        plt.figure(figsize=(12, 3))
        # NOTE(review): newer Matplotlib releases renamed the seaborn style
        # sheets; confirm this name is still accepted by the pinned version.
        plt.style.use('seaborn-darkgrid')

        ### distribution user (top row) ###
        for position, (name, users, _) in enumerate(splits, start=1):
            draw_bars(position, name + ' user',
                      count_per_class(users, self.num_user))

        ### distribution activity (bottom row) ###
        for position, (name, _, acts) in enumerate(splits, start=1):
            draw_bars(position + col, name + ' activity',
                      count_per_class(acts, self.num_act))

        ### distribution activity for user, one heatmap per split ###
        for name, users, acts in splits:
            draw_heatmap(name.lower(), users, acts)

    def update_pred_based_on_act(self, predictions_user, label_user,
                                 label_activity):
        """Tally correct and wrong user predictions per activity.

        The three sequences are walked in lockstep. For each sample the
        counter at index ``activity`` is incremented in
        ``self.final_pred_right_act`` when the predicted user equals the true
        user, otherwise in ``self.final_pred_wrong_act``.

        Args:
            predictions_user: Predicted user id for each sample.
            label_user: True user id for each sample.
            label_activity: Activity id for each sample, used as the index
                into the counters.
        """
        samples = zip(predictions_user, label_user, label_activity)
        for predicted, expected, activity in samples:
            counters = (self.final_pred_right_act
                        if predicted == expected else
                        self.final_pred_wrong_act)
            counters[activity] += 1

    def total_sample_for_act(self, test):
        """Count the samples of each activity in the test or validation split.

        Args:
            test: When true count over ``self.test_act``, otherwise over
                ``self.val_act``.

        Returns:
            A list of length ``self.num_act`` whose entry ``act`` is the
            number of samples labelled ``act``. Activities that do not occur
            in the split get 0.
        """
        labels = self.test_act if test else self.val_act
        values, counts = np.unique(labels, return_counts=True)
        # Look counts up by label value rather than by position in the
        # np.unique output: the two only coincide when every activity id
        # occurs at least once in the split.
        count_by_act = dict(zip(values.tolist(), counts.tolist()))
        return [count_by_act.get(act, 0) for act in range(self.num_act)]

    def plot_pred_based_act(self, title, test, colab_path, save_plot,
                            file_name, show_plot):
        """Plot and return the fraction of correct user predictions per activity.

        The fraction is ``self.final_pred_right_act`` divided element-wise by
        the per-activity sample totals from ``self.total_sample_for_act``.
        Drawing is delegated to the module-level ``plot_pred_based_act``
        function.

        Args:
            title: Forwarded to the plotting function.
            test: Forwarded to ``self.total_sample_for_act`` to select which
                split the totals are taken from.
            colab_path: Forwarded to the plotting function.
            save_plot: Forwarded to the plotting function.
            file_name: Forwarded to the plotting function.
            show_plot: Forwarded to the plotting function.

        Returns:
            Array with the per-activity fraction of correct predictions.
        """
        totals = np.asarray(self.total_sample_for_act(test))
        right_counts = np.asarray(self.final_pred_right_act)
        pred_right = right_counts / totals

        plot_options = dict(correct_predictions=pred_right,
                            label_act=self.mapping_act_label(),
                            title=title,
                            colab_path=colab_path,
                            dataset_name=self.dataset_name_plot,
                            save_plot=save_plot,
                            file_name=file_name,
                            show_plot=show_plot)
        plot_pred_based_act(**plot_options)

        return pred_right

    def unify_act(self, mapping):
        """Merge activity classes and reset the per-activity counters.

        Delegates to ``self.dataset.unify_act_class`` with the current train
        and test activity labels, then stores the returned class count and
        relabelled arrays on the instance. Only the train and test labels are
        passed through; other label sets are left untouched here.

        Args:
            mapping: Passed through unchanged to ``unify_act_class``.
        """
        unified = self.dataset.unify_act_class(self.train_act, self.test_act,
                                               mapping)
        self.num_act, self.train_act, self.test_act = unified

        # The counters are indexed by activity id, so they must be rebuilt
        # for the new number of classes.
        self.final_pred_right_act = [0 for _ in range(self.num_act)]
        self.final_pred_wrong_act = [0 for _ in range(self.num_act)]

    def mapping_act_label(self):
        """Return the module-level ``mapping_act_label`` result for this dataset.

        Inside the method body the bare name resolves to the module-level
        function, not to this method.
        """
        dataset_name = self.dataset_name
        return mapping_act_label(dataset_name)