def create_dataset(self, run_colab, colab_path):
    if self.magnitude:
        channel = self.configuration.config[self.dataset_name]['WINDOW_AXES'] + len(
            list(self.configuration.config[self.dataset_name]['SENSOR_DICT'].keys()))
    else:
        channel = self.configuration.config[self.dataset_name]['WINDOW_AXES']
    path = self.configuration.config[self.dataset_name]['PATH_OUTER_PARTITION']
    # join with the path of the drive data
    if run_colab:
        path = colab_path + ''.join(path.split('.')[1:])
    if '128' not in self.outer_dir:
        winlen = self.configuration.config[self.dataset_name]['WINDOW_SAMPLES']
    else:
        winlen = 128
    self.winlen = winlen
    self.dataset = Dataset(path=path,
                           channel=channel,
                           winlen=winlen,
                           user_num=self.configuration.config[self.dataset_name]['NUM_CLASSES_USER'],
                           act_num=self.configuration.config[self.dataset_name]['NUM_CLASSES_ACTIVITY'],
                           outer_dir=self.outer_dir)
def transform(self, data_file, batch_size, data_type="test", shuffle=False, device=None):
    """
    Transform raw text from data_file to Dataset and create data loader.
    """
    raw_data = self.read_data(data_file, data_type=data_type)
    examples = self.build_examples(raw_data)
    data = Dataset(examples)
    data_loader = data.create_batches(batch_size, shuffle, device)
    return data_loader
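# Hedged usage sketch for transform() above (not from the original source);
# `corpus` stands in for an instance of the class defining transform(), and
# the file path is hypothetical.
#
#   test_loader = corpus.transform("data/demo.test", batch_size=32,
#                                  data_type="test", shuffle=False)
#   for batch in test_loader:
#       ...  # batches come from Dataset.create_batches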
def load_data(self, prepared_data_file=None):
    """
    Load prepared data from a serialized file and wrap each split in a Dataset.
    """
    prepared_data_file = prepared_data_file or self.prepared_data_file
    print("Loading prepared data from {} ...".format(prepared_data_file))
    data = torch.load(prepared_data_file)
    self.data = {"train": Dataset(data["train"]),
                 "valid": Dataset(data["valid"]),
                 "test": Dataset(data["test"])}
    print("Number of examples:",
          " ".join("{}-{}".format(k.upper(), len(v))
                   for k, v in self.data.items()))
def train(args):
    """
    Trains the reading comprehension model.
    """
    logger = logging.getLogger("brc")
    logger.info('check the directories...')
    for dir_path in [
            os.path.join(args.model_dir, args.data_type),
            os.path.join(args.result_dir, args.data_type),
            os.path.join(args.summary_dir, args.data_type)
    ]:
        if not os.path.exists(dir_path):
            logger.warning(
                "directory {} does not exist, creating it".format(dir_path))
            os.makedirs(dir_path)
    # data_type can easily be inconsistent with the data files, so check it here
    for f in args.train_files + args.dev_files + args.test_files:
        if args.data_type not in f:
            raise ValueError('Inconsistency between data_type and files')
    logger.info('Load data_set and vocab...')
    vocab_path = os.path.join(args.vocab_dir, args.data_type, args.vocab_file)
    with open(vocab_path, 'rb') as fin:
        logger.info('load vocab from {}'.format(vocab_path))
        vocab = pickle.load(fin)
    brc_data = Dataset(
        args.max_p_num, args.max_p_len, args.max_q_len, args.max_a_len,
        train_answer_len_cut_bins=args.train_answer_len_cut_bins,
        train_files=args.train_files,
        dev_files=args.dev_files,
        badcase_sample_log_file=args.badcase_sample_log_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab, args.use_oov2unk)
    logger.info('Initialize the model...')
    rc_model = MultiAnsModel(vocab, args)
    logger.info('Training the model...')
    rc_model.train_and_evaluate_several_batchly(
        data=brc_data,
        epochs=args.epochs,
        batch_size=args.batch_size,
        evaluate_cnt_in_one_epoch=args.evaluate_cnt_in_one_epoch,
        save_dir=os.path.join(args.model_dir, args.data_type),
        save_prefix=args.desc + args.algo)
    logger.info('Done with model training!')
def load_dataset(splits=('train', 'dev', 'test'), domains='all', strict=False,
                 base_path=None, elmo=False):
    """
    :param splits: dataset splits to load
    :param domains: filter for domains (if 'all', use all available)
    :param strict: if True, select only dialogues that contain a single domain
    :return: dataset, ontology, vocab, and a word-to-vector mapping
    """
    path = base_path if base_path else dann
    # TODO: implement filtering with `domains` and `strict`
    with open(os.path.join(path, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))
    with open(os.path.join(path, 'vocab.json')) as f:
        vocab = Vocab.from_dict(json.load(f))
    with open(os.path.join(path, 'emb.json')) as f:
        E = json.load(f)
    w2v = {w: E[i] for i, w in enumerate(vocab.to_dict()['index2word'])}
    dataset = {}
    for split in splits:
        with open(os.path.join(path, '{}.json'.format(split))) as f:
            logging.warning('loading split {}'.format(split))
            dataset[split] = Dataset.from_dict(json.load(f))
    logging.info('dataset sizes: {}'.format(
        pformat({k: len(v) for k, v in dataset.items()})))
    return dataset, ontology, vocab, w2v
def collect_demo(self):
    dataset = Dataset(self.max_length)
    for _df in self.df_list:  # iterate directly; the old enumerate index shadowed the inner loop variable
        t = list()
        for idx in range(_df.shape[0] - 1):  # stop one step before termination
            t.append(transition(
                obs=self._pos2state(np.around(_df['f1'][idx], self.around_digit),
                                    np.around(_df['f2'][idx], self.around_digit)),
                act=self.inv_action_idx[(np.around(_df['a1'][idx + 1], self.around_digit),
                                         np.around(_df['a2'][idx + 1], self.around_digit))],
                next_obs=self._pos2state(np.around(_df['f1'][idx + 1], self.around_digit),
                                         np.around(_df['f2'][idx + 1], self.around_digit)),
                rew=1.0))
        dataset.append(t)
        self.goal_states.append(
            self._pos2state(np.around(_df['f1'][_df.shape[0] - 1], self.around_digit),
                            np.around(_df['f2'][_df.shape[0] - 1], self.around_digit)))
    return dataset
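# collect_demo() above constructs transition(...) records with keyword
# arguments. A minimal sketch of one possible definition, assuming a simple
# namedtuple (an illustration, not recovered from the original source):
from collections import namedtuple

# field names mirror the keyword arguments used in collect_demo
transition = namedtuple('transition', ['obs', 'act', 'next_obs', 'rew'])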
def reload(self, data_type='test'):
    data_file = os.path.join(self.data_dir,
                             self.data_prefix + "." + data_type)
    data_raw = self.read_data(data_file, data_type="test")
    data_examples = self.build_examples(data_raw)
    self.data[data_type] = Dataset(data_examples)
    print("Number of examples:",
          " ".join("{}-{}".format(k.upper(), len(v))
                   for k, v in self.data.items()))
def get_dataset(id: str) -> Tuple[Dataset, str]:  # requires: from typing import Tuple
    """
    Get a dataset from the built-in datasets.

    :param id: The id (must equal the Datasets enum name) of the dataset
    :return: A fully loaded dataset and a message for the user
    """
    dataset = Dataset.built_in(id)
    msg = "Dataset '{} ({})' loaded successfully. " \
          "For further information about this dataset please visit: {}" \
        .format(dataset.id.name, dataset.name, dataset.url)
    log.info(msg)
    log.info("\n{}".format(dataset.df.head()))
    return dataset, msg
def get_dataset(name: str, url: str) -> Tuple[Dataset, str]:
    """
    Get a dataset from a URL (external source).

    :param name: The name of the dataset
    :param url: The URL from which the dataset should be (down-)loaded
    :return: A fully loaded dataset and a message for the user
    """
    dataset = Dataset.from_url(name, url)
    msg = "Dataset '{} ({})' loaded successfully. " \
          "For further information about this dataset please visit: {}" \
        .format(dataset.id.name, dataset.name, dataset.url)
    log.info(msg)
    log.info("\n{}".format(dataset.df.head()))
    return dataset, msg
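# Hedged usage sketch covering both get_dataset() variants above; the id,
# name, and URL values are hypothetical examples, not from the source.
#
#   builtin_ds, msg = get_dataset("IRIS")
#   remote_ds, msg = get_dataset("my-data", "https://example.com/data.csv")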
def evaluate(args):
    """
    Evaluates the trained model on the dev files.
    """
    logger = logging.getLogger("brc")
    logger.info('Load data_set and vocab...')
    vocab_path = os.path.join(args.vocab_dir, args.data_type, args.vocab_file)
    with open(vocab_path, 'rb') as fin:
        logger.info('load vocab from {}'.format(vocab_path))
        vocab = pickle.load(fin)
    assert len(args.dev_files) > 0, 'No dev files are provided.'
    # data_type can easily be inconsistent with the data files, so check it here
    for f in args.train_files + args.dev_files + args.test_files:
        if args.data_type not in f:
            raise ValueError('Inconsistency between data_type and files')
    brc_data = Dataset(args.max_p_num, args.max_p_len, args.max_q_len,
                       dev_files=args.dev_files,
                       badcase_sample_log_file=args.badcase_sample_log_file)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab, args.use_oov2unk)
    logger.info('Build the model...')
    rc_model = MultiAnsModel(vocab, args)
    logger.info('restore model from {}, with prefix {}'.format(
        os.path.join(args.model_dir, args.data_type), args.desc + args.algo))
    rc_model.restore(model_dir=os.path.join(args.model_dir, args.data_type),
                     model_prefix=args.desc + args.algo)
    logger.info('Evaluating the model on dev set...')
    dev_batches = brc_data.gen_mini_batches('dev', args.batch_size,
                                            pad_id=vocab.get_id(vocab.pad_token),
                                            shuffle=False)
    total_batch_count = brc_data.get_data_length('dev') // args.batch_size + \
        int(brc_data.get_data_length('dev') % args.batch_size != 0)
    dev_loss, dev_bleu_rouge = rc_model.evaluate(
        total_batch_count, dev_batches,
        result_dir=os.path.join(args.result_dir, args.data_type),
        result_prefix='dev.predicted')
    logger.info('Loss on dev set: {}'.format(dev_loss))
    logger.info('Result on dev set: {}'.format(dev_bleu_rouge))
    logger.info('Predicted answers are saved to {}'.format(
        os.path.join(args.result_dir, args.data_type)))
def generate_dataset_elmo(elmo, splits=('train', 'dev', 'test'), domains='all',
                          strict=False, base_path=None):
    """
    Load the dataset splits and precompute ELMo embeddings for each.
    """
    path = base_path if base_path else ''
    with open(os.path.join(path, 'ontology.json')) as f:
        ontology = Ontology.from_dict(json.load(f))
    dataset = {}
    for split in splits:
        with open(os.path.join(path, '{}.json'.format(split))) as f:
            logging.warning('loading split {}'.format(split))
            data = Dataset.from_dict(json.load(f))
            # data.dialogues = data.dialogues[:500]
            data.to_elmo(elmo)
            dataset[split] = data
    logging.info('dataset sizes: {}'.format(
        pformat({k: len(v) for k, v in dataset.items()})))
    return dataset, ontology
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

from models.linear_regression import LinearRegressionModel
from util.dataset import Dataset

num_centers = 2
lineres = 100

X, y = make_blobs(n_samples=30, n_features=2, centers=num_centers,
                  center_box=(-10.0, 10.0))
print(X)
print(y)

dataset = Dataset(X, y)
xdata = dataset.get_feature(0)
ydata = dataset.get_feature(1)

xmin = np.min(xdata) - 3
xmax = np.max(xdata) + 3
ymin = np.min(ydata) - 3
ymax = np.max(ydata) + 3

regression_model = LinearRegressionModel(2)
regression_model.fit(dataset)

xline = [xmin, xmax]
yline = [regression_model.predict([xmin]), regression_model.predict([xmax])]

plt.figure()
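# A plausible continuation of the truncated plotting code above, sketched as
# comments under the assumption that the script scatters the blobs and draws
# the fitted line (the actual continuation was not recovered):
#
#   plt.scatter(xdata, ydata, c=y)
#   plt.plot(xline, yline, 'r-')
#   plt.xlim(xmin, xmax)
#   plt.ylim(ymin, ymax)
#   plt.show()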
common.set_flags()
common.make_dirs(os.path.join(FLAGS.save_dir, "dataset_ready"))

env = Game.make("KoreanChess-v1",
                {"use_check": False, "limit_step": FLAGS.max_step,
                 "print_mcts_history": FLAGS.print_mcts_history,
                 "use_color_print": FLAGS.use_color_print,
                 "use_cache": FLAGS.use_cache})

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
model = Model(sess, weight_decay=FLAGS.weight_decay, momentum=FLAGS.momentum,
              num_layers=FLAGS.num_model_layers, use_cache=FLAGS.use_cache,
              conf=FLAGS)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
ds = Dataset(sess)

while True:
    common.restore_model(FLAGS.save_dir, "best_model.ckpt", saver, sess)
    now = common.now_date_str_nums()
    dataset_path = os.path.join(FLAGS.save_dir,
                                ("dataset_%s_%s.csv" % (now, uuid.uuid4())))
    ds.open(dataset_path)
    game_results = {"b": 0, "r": 0, "d": 0}
    episode = 0
    while True:
        # self-play
        log("self-play episode %d" % episode)
        info, state_history, mcts_history = play.self_play(
            env, model, FLAGS.max_simulation, FLAGS.max_step, FLAGS.c_puct,
            FLAGS.exploration_step, FLAGS.reuse_mcts, FLAGS.print_mcts_tree,
            FLAGS.num_state_history, FLAGS.print_mcts_search)
p = Parameters()
params = p.get_parameters()
workspace = Workspace(params.w, params.em, params.exp_id)

if os.path.exists(workspace.base):
    shutil.rmtree(workspace.base)  # fresh start
os.makedirs(workspace.base)
os.makedirs(workspace.result_dir)

# generate train data from raw, for mining and embedding
dataset = Dataset(
    workspace,
    train_ratio=params.tr_ratio,
    shuffle=True,
    load_existing_test_files=params.load_existing_test_files,
    load_existing_test_files_sparsity=params.load_existing_test_files_sparsity)

global_iters = params.g_iters
print(str(params))
with open(workspace.base + "/params.txt", "w") as params_dump_file:
    params_dump_file.write(str(params))

for iter_id in range(global_iters):
    print("Global Iter: ", iter_id)
    # run embedding model
    if iter_id == 0:
sess = tf.Session(config=config)
writer = tf.summary.FileWriter(FLAGS.save_dir + '/summary', sess.graph)
model = Model(sess, weight_decay=FLAGS.weight_decay, momentum=FLAGS.momentum,
              num_layers=FLAGS.num_model_layers, use_cache=FLAGS.use_cache,
              conf=FLAGS)
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()
learning_rate = FLAGS.learning_rate
common.restore_model(FLAGS.save_dir, FLAGS.model_file_name, saver, sess)

dataset_path = os.path.join(FLAGS.save_dir, "dataset.csv")
ds = Dataset(sess)
ds.open(dataset_path)

game_results = {"b": 0, "r": 0, "d": 0}
wins = 0
for episode in range(FLAGS.max_episode):
    # self-play
    print("self-play episode %d" % episode)
    info, state_history, mcts_history = play.self_play(
        env, model, FLAGS.max_simulation, FLAGS.max_step, FLAGS.c_puct,
        FLAGS.exploration_step, FLAGS.reuse_mcts, FLAGS.print_mcts_tree,
        FLAGS.num_state_history, FLAGS.print_mcts_search)
    if info["winner"]:
        game_results[info["winner"]] += 1
        wins += 1
# print "loading train data..." # x_u, x_r, y, _ = util.load_data(config.train_file, True, config.neg_sample) # train_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict) # print np.array(train_dataset.ques_idx).shape, np.array(train_dataset.rela_idx).shape, np.array( # train_dataset.label).shape # # print "loading dev data..." # x_u, x_r, y, _ = util.load_data(config.dev_file, True, config.neg_sample) # dev_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict) # print np.array(dev_dataset.ques_idx).shape, np.array(dev_dataset.rela_idx).shape, np.array(dev_dataset.label).shape print("loading test data...") x_u, x_r, y, _ = util.load_data(config.test_file, False, config.num_classes) print(np.array(x_u).shape, np.array(x_r).shape, np.array(y).shape) print(x_u[0], x_r[0], y[0]) test_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict) print( np.array(test_dataset.ques_idx).shape, np.array(test_dataset.rela_idx).shape, np.array(test_dataset.label).shape) # print "training..." # train_nn(train_dataset, dev_dataset, config.max_sent_len, embedding) print("testing...") test_nn(test_dataset, config.max_sent_len, embedding) end = time.time() print('total time: %s' % str(end - start))
print("time {}, test loss {:g}, train acc {:g}".format( end - start, test_loss / test_set.size, test_correct_num / test_set.size)) if __name__ == "__main__": start = time.time() print("loading word embedding...") word_dict, embedding = util.get_pretrained_word_vector( config.word2vec_file, (config.voc_size, config.emb_size)) print("vocabulary size: %d" % len(word_dict)) print("loading train data...") x_u, x_r, y, _ = util.load_data(config.train_file, True, config.neg_sample) train_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict) print( np.array(train_dataset.ques_idx).shape, np.array(train_dataset.rela_idx).shape, np.array(train_dataset.label).shape) print("train dataset length:") print(train_dataset.ques_lens, train_dataset.rela_lens) print("loading dev data...") x_u, x_r, y, _ = util.load_data(config.dev_file, True, config.neg_sample) dev_dataset = Dataset(x_u, x_r, y, config.max_sent_len, word_dict) print( np.array(dev_dataset.ques_idx).shape, np.array(dev_dataset.rela_idx).shape, np.array(dev_dataset.label).shape)
def prepare(args):
    """
    Checks data, creates the directories, prepares the vocabulary and embeddings.
    """
    logger = logging.getLogger()
    logger.info('Checking the data files...')
    for data_path in args.train_files + args.dev_files + args.test_files:
        assert os.path.exists(data_path), '{} file does not exist.'.format(data_path)
    logger.info('Preparing the directories...')
    for dir_path in [
            os.path.join(args.vocab_dir, args.data_type),
            os.path.join(args.model_dir, args.data_type),
            os.path.join(args.result_dir, args.data_type),
            os.path.join(args.summary_dir, args.data_type)
    ]:
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
    # data_type can easily be inconsistent with the data files, so check it here
    for f in args.train_files + args.dev_files + args.test_files:
        if args.data_type not in f:
            raise ValueError('Inconsistency between data_type and files')
    if args.create_vocab:
        logger.info('load train dataset...')
        brc_data = Dataset(
            args.max_p_num, args.max_p_len, args.max_q_len, args.max_a_len,
            train_answer_len_cut_bins=args.train_answer_len_cut_bins,
            train_files=args.train_files,
            badcase_sample_log_file=args.badcase_sample_log_file)
        logger.info('Building vocabulary...')
        vocab = Vocab(
            init_random=args.initial_tokens_random,
            trainable_oov_cnt_threshold=args.trainable_oov_cnt_threshold)
        for word in brc_data.word_iter('train'):
            vocab.add(word)
        unfiltered_vocab_size = vocab.size()
        vocab.filter_tokens_by_cnt(min_cnt=args.vocab_min_cnt)
        filtered_num = unfiltered_vocab_size - vocab.size()
        logger.info('After filtering {} tokens, the final vocab size is {}'.format(
            filtered_num, vocab.size()))
        logger.info('Assigning embeddings...')
        if args.pretrained_word_path is not None:
            logger.info('load the pretrained word embeddings...')
            vocab.build_embedding_matrix(args.pretrained_word_path)
        else:
            logger.info('randomly init word embeddings...')
            vocab.randomly_init_embeddings(args.embed_size)
        logger.info('Saving vocab...')
        vocab_path = os.path.join(args.vocab_dir, args.data_type,
                                  args.vocab_file)
        with open(vocab_path, 'wb') as fout:
            pickle.dump(vocab, fout)
    logger.info('Done with preparing!')
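# Hedged driver sketch tying prepare()/train()/evaluate() together; the
# argparse wiring and flag names below are assumptions, not recovered from
# the original source.
#
#   if __name__ == '__main__':
#       args = parse_args()   # hypothetical CLI parser
#       if args.prepare:
#           prepare(args)
#       if args.train:
#           train(args)
#       if args.evaluate:
#           evaluate(args)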
    len(labeled_data), len(unlabeled_data), len(dev_data)))

# Tokenizing
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

labeled_texts = [data[0] for data in labeled_data]
labeled_labels = [data[1] for data in labeled_data]
if args.do_augment:
    augmented_texts, augmented_labels = back_translate(labeled_texts,
                                                       labeled_labels)
    labeled_texts.extend(augmented_texts)
    labeled_labels.extend(augmented_labels)
labeled_encodings = tokenizer(labeled_texts, truncation=True, padding=True)
labeled_dataset = Dataset(labeled_encodings, labeled_labels)

dev_texts = [data[0] for data in dev_data]
dev_labels = [data[1] for data in dev_data]
dev_encodings = tokenizer(dev_texts, truncation=True, padding=True)
dev_dataset = Dataset(dev_encodings, dev_labels)

test_encodings = tokenizer(test_texts, truncation=True, padding=True)
test_dataset = Dataset(test_encodings, test_labels)

# We keep the labels of the unlabeled data to track the accuracy of pseudo-labeling
unlabeled_texts = [data[0] for data in unlabeled_data]
unlabeled_labels = [data[1] for data in unlabeled_data]
unlabeled_encodings = tokenizer(unlabeled_texts, truncation=True, padding=True)
unlabeled_dataset = Dataset(unlabeled_encodings, unlabeled_labels)
if missing_files(draw, splits):
    if not os.path.isdir(draw):
        os.makedirs(draw)

if missing_files(dann, files=splits + ['ontology', 'vocab', 'emb']):
    if not os.path.isdir(dann):
        os.makedirs(dann)
    dataset = {}
    ontology = Ontology()
    vocab = Vocab()
    vocab.word2index(['<sos>', '<eos>'], train=True)
    for s in splits:
        fname = '{}.json'.format(s)
        logging.warning('Annotating {}'.format(s))
        dataset[s] = Dataset.annotate_raw(os.path.join(draw, fname))
        dataset[s].numericalize_(vocab)
        ontology = ontology + dataset[s].extract_ontology()
        with open(os.path.join(dann, fname), 'wt') as f:
            json.dump(dataset[s].to_dict(), f)
    ontology.numericalize_(vocab)
    with open(os.path.join(dann, 'ontology.json'), 'wt') as f:
        json.dump(ontology.to_dict(), f)
    with open(os.path.join(dann, 'vocab.json'), 'wt') as f:
        json.dump(vocab.to_dict(), f)

    logging.warning('Computing word embeddings')
    embeddings = [GloveEmbedding(), KazumaCharEmbedding()]
    E = []
    for w in tqdm(vocab._index2word):
        e = []
def self_train(self, labeled_dataset, unlabeled_dataset, guide_type=None,
               confidence_threshold=0.9):
    best_accuracy = -1
    min_dev_loss = 987654321  # sentinel: effectively +infinity
    print(len(unlabeled_dataset))
    print(type(unlabeled_dataset))
    for outer_epoch in range(self.config.epochs):
        sampled_num = len(unlabeled_dataset) // 2
        random.shuffle(unlabeled_dataset)
        sampled_unlabeled = unlabeled_dataset[:sampled_num]
        sampled_text = [data[0] for data in sampled_unlabeled]
        sampled_labels = [data[1] for data in sampled_unlabeled]
        sampled_encodings = self.tokenizer(sampled_text, truncation=True,
                                           padding=True)
        sampled_unlabeled_dataset = Dataset(sampled_encodings, sampled_labels)
        print('outer_epoch {} sampled unlabeled dataset {}'.format(
            outer_epoch, len(sampled_unlabeled_dataset)))

        # pseudo-labeling
        new_dataset = self.pseudo_labeling(sampled_unlabeled_dataset,
                                           confidence_threshold, guide_type)
        # add pseudo-labels to the labeled data
        combined_dataset, new_dataset = self.add_dataset(labeled_dataset,
                                                         new_dataset)
        # remove pseudo-labeled samples from the unlabeled data
        # unlabeled_dataset = self.remove_dataset(unlabeled_dataset, new_dataset)
        self.train_loader = DataLoader(combined_dataset,
                                       **self.config.train_params)
        self.early_stopping = EarlyStopping(patience=5, verbose=True)

        # re-initialize the student model from scratch
        del self.model, self.optimizer
        if self.model_type == 'baseline':
            self.model = BertForSequenceClassification.from_pretrained(
                'bert-base-uncased',
                num_labels=self.config.class_num).to(self.config.device)
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2e-5)
        else:
            self.model = BERT_ATTN(
                num_labels=self.config.class_num).to(self.config.device)
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=2e-5)

        # retrain the model on labeled + pseudo-labeled data
        best_dev_acc = -1
        for inner_epoch in range(self.config.epochs):
            print('outer_epoch {} inner_epoch {} best_accuracy {}'.format(
                outer_epoch, inner_epoch, best_accuracy))
            self.train_epoch(inner_epoch)
            dev_loss, dev_acc = self.evaluator.evaluate(self.model,
                                                        self.valid_loader)
            self.early_stopping(dev_loss)
            # save the model when the current dev_acc beats best_dev_acc
            if dev_acc > best_dev_acc:
                best_dev_acc = dev_acc
                if self.model_type == 'baseline':
                    self.model.save_pretrained(self.ssl_path)
                else:
                    self.lexicon = copy.deepcopy(self.lexicon_temp)
                    torch.save({'model_state_dict': self.model.state_dict(),
                                'optimizer_state_dict': self.optimizer.state_dict(),
                                'epoch': {'outer_epoch': outer_epoch,
                                          'inner_epoch': inner_epoch}},
                               self.ssl_path + '/checkpoint.pt')
            if inner_epoch % 1 == 0:  # always true: evaluate on test every inner epoch
                test_loss, test_acc = self.evaluator.evaluate(self.model,
                                                              self.test_loader,
                                                              is_test=True)
                if best_accuracy < test_acc:
                    best_accuracy = test_acc
            if self.model_type != 'baseline':
                self.lexicon_temp = {label: {}
                                     for label in range(self.config.class_num)}
            if self.early_stopping.early_stop:
                print("Early Stopping!")
                break
    print('Best accuracy {}'.format(best_accuracy))
def encode_dataset(self, texts, labels):
    encodings = self.tokenizer(texts, truncation=True, padding=True)
    dataset = Dataset(encodings, labels)
    return dataset
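# Minimal usage sketch for encode_dataset() above; `trainer` and the toy
# texts/labels are hypothetical, not part of the original source.
#
#   texts, labels = ["a good movie", "a bad movie"], [1, 0]
#   dataset = trainer.encode_dataset(texts, labels)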
class Model:
    def __init__(self, dataset_name, configuration_file, multi_task, lr,
                 model_type, fold_test, save_dir='log',
                 outer_dir='OuterPartition/', overlap=5.0, magnitude=False,
                 init_lr=0.001, drop_factor=0.5, drop_epoch=10,
                 path_best_model='', log=False):
        self.dataset_name = dataset_name
        self.configuration = configuration_file
        self.multi_task = multi_task
        self.lr = lr
        self.init_lr = init_lr
        self.drop_factor = drop_factor
        self.drop_epoch = drop_epoch
        self.overlap = overlap
        self.model_type = model_type
        self.epochs = configuration_file.EPOCHS
        self.num_act = configuration_file.config[dataset_name]['NUM_CLASSES_ACTIVITY']
        self.num_user = configuration_file.config[dataset_name]['NUM_CLASSES_USER']
        self.batch_size = configuration_file.BATCH_SIZE
        self.sensor_dict = configuration_file.config[dataset_name]['SENSOR_DICT']
        self.fold_test = fold_test
        self.log = log
        self.magnitude = magnitude
        if magnitude:
            self.axes = self.configuration.config[self.dataset_name]['WINDOW_AXES'] + len(
                list(self.configuration.config[self.dataset_name]['SENSOR_DICT'].keys()))
        else:
            self.axes = self.configuration.config[self.dataset_name]['WINDOW_AXES']
        # to track performance on user identification conditioned on the activity performed
        self.history_act_true = []
        self.history_act_pred = []
        self.history_user_true = []
        self.history_user_pred = []
        self.outer_dir = outer_dir
        # single timestamp so train and val logs land in the same run directory
        now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        self.train_log_dir = "{}/{}/{}/{}/batch_{}/lr_{}/over_{}/fold_{}/{}/train".format(
            save_dir, self.model_type, self.dataset_name,
            'multi_task' if self.multi_task else 'single_task',
            self.batch_size, self.lr, str(overlap), self.fold_test, now)
        self.val_log_dir = "{}/{}/{}/{}/batch_{}/lr_{}/over_{}/fold_{}/{}/val".format(
            save_dir, self.model_type, self.dataset_name,
            'multi_task' if self.multi_task else 'single_task',
            self.batch_size, self.lr, str(overlap), self.fold_test, now)
        self.train_writer = tf.summary.create_file_writer(self.train_log_dir)
        self.val_writer = tf.summary.create_file_writer(self.val_log_dir)
        self.final_pred_right_act = [0 for _ in np.arange(0, self.num_act)]
        self.final_pred_wrong_act = [0 for _ in np.arange(0, self.num_act)]
        self.best_model = None

    def create_dataset(self, run_colab, colab_path):
        if self.magnitude:
            channel = self.configuration.config[self.dataset_name]['WINDOW_AXES'] + len(
                list(self.configuration.config[self.dataset_name]['SENSOR_DICT'].keys()))
        else:
            channel = self.configuration.config[self.dataset_name]['WINDOW_AXES']
        path = self.configuration.config[self.dataset_name]['PATH_OUTER_PARTITION']
        # join with the path of the drive data
        if run_colab:
            path = colab_path + ''.join(path.split('.')[1:])
        if '128' not in self.outer_dir:
            winlen = self.configuration.config[self.dataset_name]['WINDOW_SAMPLES']
        else:
            winlen = 128
        self.winlen = winlen
        self.dataset = Dataset(path=path,
                               channel=channel,
                               winlen=winlen,
                               user_num=self.configuration.config[self.dataset_name]['NUM_CLASSES_USER'],
                               act_num=self.configuration.config[self.dataset_name]['NUM_CLASSES_ACTIVITY'],
                               outer_dir=self.outer_dir)

    def load_data(self, only_acc=False, only_acc_gyro=False, realdisp=False):
        # get data shaped [examples, window_samples, axes, channel]
        if realdisp:
            TrainData, TrainLA, TrainLU, TrainDI, ValidData, ValidLA, ValidLU, ValidDI, \
                TestData, TestLA, TestLU, TestDI = self.dataset.load_data(
                    fold_test=self.fold_test, overlapping=self.overlap,
                    realdisp=realdisp)
        else:
            TrainData, TrainLA, TrainLU, ValidData, ValidLA, ValidLU, TestData, TestLA, TestLU = \
                self.dataset.load_data(fold_test=self.fold_test,
                                       overlapping=self.overlap,
                                       realdisp=realdisp)
        self.dataset_name_plot = self.dataset_name + \
            f'_magnitude_{str(self.magnitude).lower()}' + f'_overlap_{self.overlap}'
        self.num_user = len(np.unique(TrainLU))
        # in the realdisp case there is no data for subjects 6 and 13,
        # so remap user labels onto a contiguous range
        if realdisp:
            old_user_label = np.unique(TrainLU)
            new_user_label = np.arange(len(old_user_label))
            mapping_user_label = {
                k: v for k, v in zip(old_user_label, new_user_label)
            }
            TrainLU = [mapping_user_label[user] for user in TrainLU]
            ValidLU = [mapping_user_label[user] for user in ValidLU]
            TestLU = [mapping_user_label[user] for user in TestLU]
        # if True, only the accelerometer will be used
        if only_acc:
            if self.magnitude:
                TrainData = TrainData[:, :, [0, 1, 2, 3]]
                ValidData = ValidData[:, :, [0, 1, 2, 3]]
                TestData = TestData[:, :, [0, 1, 2, 3]]
                self.axes = 4
                self.dataset._channel = 4
            else:
                TrainData = TrainData[:, :, [0, 1, 2]]
                ValidData = ValidData[:, :, [0, 1, 2]]
                TestData = TestData[:, :, [0, 1, 2]]
                self.axes = 3
                self.dataset._channel = 3
            self.dataset_name_plot = self.dataset_name_plot + 'only_acc'
        # if True, only the accelerometer and gyroscope will be used
        if only_acc_gyro:
            if self.magnitude:
                TrainData = TrainData[:, :, [0, 1, 2, 3, 4, 5, 6, 7]]
                ValidData = ValidData[:, :, [0, 1, 2, 3, 4, 5, 6, 7]]
                TestData = TestData[:, :, [0, 1, 2, 3, 4, 5, 6, 7]]
                self.axes = 8
                self.dataset._channel = 8
            else:
                TrainData = TrainData[:, :, [0, 1, 2, 3, 4, 5]]
                ValidData = ValidData[:, :, [0, 1, 2, 3, 4, 5]]
                TestData = TestData[:, :, [0, 1, 2, 3, 4, 5]]
                self.axes = 6
                self.dataset._channel = 6
            self.dataset_name_plot = self.dataset_name_plot + 'only_acc_gyro'
        self.dataset._channel = self.axes
        self.train = TrainData
        self.train_user = TrainLU
        self.train_act = TrainLA
        if realdisp:
            self.train_di = TrainDI
        self.val = ValidData
        self.val_user = ValidLU
        self.val_act = ValidLA
        if realdisp:
            self.val_di = ValidDI
        self.test = TestData
        self.test_user = TestLU
        self.test_act = TestLA
        if realdisp:
            self.test_di = TestDI

    def normalize_data(self):
        # normalize data
        self.train, self.val, self.test = self.dataset.normalize_data(
            self.train, self.val, self.test)

    def tf_dataset(self, method, weighted):
        if weighted == 'no':
            self.create_tensorflow_dataset()
        else:
            if method == 'act':
                datasets, weights = self.create_dataset_for_act(weighted)
            if method == 'subject':
                datasets, weights = self.create_dataset_for_subject(weighted)
            if method == 'act_subject':
                datasets, weights = self.create_dataset_for_act_subject(weighted)
            weights = np.where(weights == float('Inf'), 0, weights)
            dataset_weighted = tf.data.experimental.sample_from_datasets(
                datasets, weights)
            dataset_weighted = dataset_weighted.shuffle(
                buffer_size=self.train.shape[0], reshuffle_each_iteration=True)
            dataset_weighted = dataset_weighted.batch(self.batch_size,
                                                      drop_remainder=True)
            self.train_data = dataset_weighted
        ValData = tf.data.Dataset.from_tensor_slices(self.val)
        ValLA = tf.data.Dataset.from_tensor_slices(self.val_act)
        ValLU = tf.data.Dataset.from_tensor_slices(self.val_user)
        val_data = tf.data.Dataset.zip((ValData, ValLA, ValLU))
        self.val_data = val_data.batch(len(ValData))
        TestData = tf.data.Dataset.from_tensor_slices(self.test)
        TestLA = tf.data.Dataset.from_tensor_slices(self.test_act)
        TestLU = tf.data.Dataset.from_tensor_slices(self.test_user)
        test_data = tf.data.Dataset.zip((TestData, TestLA, TestLU))
        self.test_data = test_data.batch(len(TestData))

    def create_tensorflow_dataset(self):
        TrainData = tf.data.Dataset.from_tensor_slices(self.train)
        TrainLA = \
            tf.data.Dataset.from_tensor_slices(self.train_act)
        TrainLU = tf.data.Dataset.from_tensor_slices(self.train_user)
        train_data = tf.data.Dataset.zip((TrainData, TrainLA, TrainLU))
        train_data = train_data.shuffle(buffer_size=self.train.shape[0],
                                        reshuffle_each_iteration=True)
        train_data = train_data.batch(self.batch_size, drop_remainder=True)
        self.train_data = train_data

    def create_dataset_for_act_subject(self, method='balance'):
        datasets = []
        act_user_sample_count = []
        for user in np.unique(self.train_user):
            idx_user = np.where(self.train_user == user)
            for act in np.unique(self.train_act):
                idx = np.intersect1d(idx_user, np.where(self.train_act == act))
                dataset = tf.data.Dataset.from_tensor_slices(
                    (self.train[idx], self.train_act[idx], self.train_user[idx]))
                datasets.append(dataset)
                act_user_sample_count.append(len(idx))
        if method == 'balance':
            weights = np.repeat(1., len(act_user_sample_count)) / act_user_sample_count
        if method == 'train_set':
            n = np.sum(act_user_sample_count)
            weights = act_user_sample_count / \
                np.repeat(n, len(act_user_sample_count))
        return datasets, weights

    def create_dataset_for_subject(self, method='balance'):
        datasets = []
        for user in np.unique(self.train_user):
            idx = np.where(self.train_user == user)
            dataset = tf.data.Dataset.from_tensor_slices(
                (self.train[idx], self.train_act[idx], self.train_user[idx]))
            datasets.append(dataset)
        user_sample_count = [
            np.where(self.train_user == user)[0].shape[0]
            for user in np.unique(self.train_user)
        ]
        if method == 'balance':
            weights = np.repeat(1., len(user_sample_count)) / user_sample_count
        if method == 'train_set':
            n = np.sum(user_sample_count)
            weights = user_sample_count / \
                np.repeat(n, len(user_sample_count))
        return datasets, weights

    def create_dataset_for_act(self, method='balance'):
        '''
        Weight samples in the dataset by inverse activity frequency.
        '''
        datasets = []
        for act in np.unique(self.train_act):
            idx = np.where(self.train_act == act)
            temp_d = self.train[idx]
            temp_a = self.train_act[idx]
            temp_u = self.train_user[idx]
            dataset = tf.data.Dataset.from_tensor_slices((temp_d, temp_a, temp_u))
            datasets.append(dataset)
        # compute sample weights so the batch distribution matches the train set
        activities_sample_count = [
            np.where(self.train_act == act)[0].shape[0]
            for act in np.unique(self.train_act)
        ]
        # balanced samples in each batch
        if method == 'balance':
            weights = np.repeat(1., len(activities_sample_count)) / activities_sample_count
        # same distribution as the train set in every batch
        if method == 'train_set':
            n = np.sum(activities_sample_count)
            weights = activities_sample_count / \
                np.repeat(n, len(activities_sample_count))
        return datasets, weights

    def augment_data(self, function_to_apply=[], augmented_par=[],
                     ratio_random_transformations=1, compose=False,
                     only_compose=False, plot_augmented=False,
                     n_func_to_apply=3):
        shape_original = self.train.shape[0]
        if self.magnitude:
            n_sensor = self.train.shape[2] / 4
        else:
            n_sensor = self.train.shape[2] / 3
        train_augmented, label_user_augmented, label_act_augmented = self.dataset.augment_data(
            self.train, self.train_user, self.train_act, self.magnitude,
            augmented_par, function_to_apply, compose, only_compose,
            plot_augmented, ratio_random_transformations, n_func_to_apply,
            n_sensor)
        self.train = train_augmented
        self.train_user = label_user_augmented
        self.train_act = label_act_augmented
        print('data before augmentation {}, data after augmentation {}'.format(
            shape_original, train_augmented.shape[0]))
        self.dataset_name_plot = self.dataset_name_plot + '_augmented'
        if self.winlen != 100:
            self.dataset_name_plot = self.dataset_name_plot + '_w_128'

    def build_model(self, stride=1, fc=False):
        print('using model: ', self.model_type)
        if self.model_type == 'resnet18_2D':
            self.model = resnet2D(self.multi_task, self.num_act,
                                  self.num_user, stride=stride, fc=fc)
        if self.model_type == 'resnet18_multi_branch':
            self.model = resnet18MultiBranch(self.sensor_dict, self.num_user,
                                             self.magnitude)
        if self.model_type == 'resnet18_lstm_parallel':
            self.model = parallel(self.multi_task, self.num_act, self.num_user)
        if self.model_type == 'resnet18_lstm_consecutive':
            self.model = consecutive(self.multi_task, self.num_act,
                                     self.num_user)
        if self.model_type == 'resnet18_1D':
            self.model = resnet1D(self.multi_task, self.num_act, self.num_user)
        if self.model_type == 'resnet18_2D_multitask':
            self.model = resne18MultiTask(self.num_act, self.num_user)
        samples = self.winlen
        self.model.build(input_shape=(None, samples, self.axes, 1))

    def print_model_summary(self):
        self.model.summary()

    def loss_opt_metric(self):
        # define losses and the optimizer
        self.loss_act = tf.keras.losses.SparseCategoricalCrossentropy()
        self.loss_user = tf.keras.losses.SparseCategoricalCrossentropy()
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.init_lr)
        # performance on train
        self.train_loss_activity = tf.keras.metrics.Mean(
            name='train_loss_activity')
        self.train_loss_user = tf.keras.metrics.Mean(name='train_loss_user')
        self.train_accuracy_activity = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy_activity')
        self.train_accuracy_user = tf.keras.metrics.SparseCategoricalAccuracy(
            name='train_accuracy_user')
        self.train_precision_user = tf.keras.metrics.Precision()
        self.train_recall_user = tf.keras.metrics.Recall()
        # performance on validation
        self.valid_loss_activity = tf.keras.metrics.Mean(
            name='valid_loss_activity')
        self.valid_loss_user = tf.keras.metrics.Mean(name='valid_loss_user')
        self.valid_accuracy_activity = tf.keras.metrics.SparseCategoricalAccuracy(
            name='valid_accuracy_activity')
        self.valid_accuracy_user = tf.keras.metrics.SparseCategoricalAccuracy(
            name='valid_accuracy_user')
        self.val_precision_user = tf.keras.metrics.Precision()
        self.val_recall_user = tf.keras.metrics.Recall()

    @tf.function
    def train_step(self, batch, label_activity, label_user, num_user):
        with tf.GradientTape() as tape:
            if self.multi_task:
                predictions_act, predictions_user = self.model(batch,
                                                               training=True)
                loss_a = self.loss_act(y_true=label_activity,
                                       y_pred=predictions_act)
                loss_u = self.loss_user(y_true=label_user,
                                        y_pred=predictions_user)
                loss_global = loss_a + loss_u
            else:
                predictions_user = self.model(batch, training=True)
                loss_u = self.loss_user(y_true=label_user,
                                        y_pred=predictions_user)
                loss_global = loss_u
        gradients = tape.gradient(loss_global, self.model.trainable_variables)
        self.optimizer.apply_gradients(
            grads_and_vars=zip(gradients, self.model.trainable_variables))
        if self.multi_task:
            self.train_loss_activity.update_state(values=loss_a)
            self.train_accuracy_activity.update_state(y_true=label_activity,
                                                      y_pred=predictions_act)
        self.train_loss_user.update_state(values=loss_u)
        self.train_accuracy_user.update_state(y_true=label_user,
                                              y_pred=predictions_user)
        # confusion matrix on the batch
        cm = tf.math.confusion_matrix(label_user,
                                      tf.math.argmax(predictions_user, axis=1),
                                      num_classes=num_user)
        return cm

    @tf.function
    def valid_step(self, batch, label_activity, label_user, num_user):
        if self.multi_task:
            if self.best_model is not None:
                predictions_act, predictions_user = self.best_model(
                    batch, training=False)
                loss_a = \
                    self.loss_act(y_true=label_activity,
                                  y_pred=predictions_act)
            else:
                predictions_act, predictions_user = self.model(batch,
                                                               training=False)
                loss_a = self.loss_act(y_true=label_activity,
                                       y_pred=predictions_act)
        else:
            if self.best_model is not None:
                predictions_user = self.best_model(batch, training=False)
            else:
                predictions_user = self.model(batch, training=False)
        loss_u = self.loss_user(y_true=label_user, y_pred=predictions_user)
        if self.multi_task:
            self.valid_loss_activity.update_state(values=loss_a)
            self.valid_accuracy_activity.update_state(y_true=label_activity,
                                                      y_pred=predictions_act)
        self.valid_loss_user.update_state(values=loss_u)
        self.valid_accuracy_user.update_state(y_true=label_user,
                                              y_pred=predictions_user)
        # confusion matrix from which precision, recall and f1 are computed
        cm = tf.math.confusion_matrix(label_user,
                                      tf.math.argmax(predictions_user, axis=1),
                                      num_classes=num_user)
        return cm, tf.math.argmax(predictions_user, axis=1)

    def distribution_act_on_batch(self, label_act):
        distribution = {
            act: np.count_nonzero(label_act == act)
            for act in np.unique(label_act)
        }
        pprint.pprint(distribution)

    def train_model(self, epochs):
        self.epochs = epochs
        if self.model_type == 'resnet18_2D_multitask':
            self.train_multi_task()
        elif self.multi_task:
            self.train_multi_task()
        else:
            self.train_single_task()

    def train_single_task(self):
        # track the best model seen so far
        best_seen = {
            'epoch': 0,
            'loss': 10,
            'model': None,
            'time_not_improved': 0
        }
        for epoch in range(1, self.epochs + 1):
            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)

            ### PERFORMANCE ON TRAIN AFTER EACH EPOCH ###
            for batch, label_act, label_user in self.train_data:
                # self.distribution_act_on_batch(label_act)
                cm_batch = self.train_step(batch, None, label_user,
                                           self.num_user)
                cm = cm + cm_batch
            metrics = custom_metrics(cm)
            if self.log:
                print(
                    "TRAIN: epoch: {}/{}, loss_user: {:.5f}, acc_user: {:.5f}, "
                    "macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
                    .format(epoch, self.epochs,
                            self.train_loss_user.result().numpy(),
                            self.train_accuracy_user.result().numpy(),
                            metrics['macro_precision'],
                            metrics['macro_recall'], metrics['macro_f1']))
            with self.train_writer.as_default():
                tf.summary.scalar('loss_user', self.train_loss_user.result(),
                                  step=epoch)
                tf.summary.scalar('accuracy_user',
                                  self.train_accuracy_user.result(),
                                  step=epoch)
                tf.summary.scalar('macro_precision_user',
                                  metrics['macro_precision'], step=epoch)
                tf.summary.scalar('macro_recall_user',
                                  metrics['macro_recall'], step=epoch)
                tf.summary.scalar('macro_f1_user', metrics['macro_f1'],
                                  step=epoch)
            self.train_loss_user.reset_states()
            self.train_accuracy_user.reset_states()
            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)

            ### PERFORMANCE ON VALIDATION AFTER EACH EPOCH ###
            temp_predictions_user = []
            temp_label_user = []
            temp_label_act = []
            for batch, label_act, label_user in self.val_data:
                cm_batch, predictions_user = self.valid_step(
                    batch, label_act, label_user, self.num_user)
                cm = cm + cm_batch
                temp_predictions_user.extend(predictions_user.numpy())
                temp_label_user.extend(label_user.numpy())
                temp_label_act.extend(label_act.numpy())
            metrics = custom_metrics(cm)
            if self.log:
                print(
                    "VALIDATION: epoch: {}/{}, loss_user: {:.5f}, acc_user: {:.5f}, "
                    "macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
                    .format(epoch, self.epochs,
                            self.valid_loss_user.result().numpy(),
                            self.valid_accuracy_user.result().numpy(),
                            metrics['macro_precision'],
                            metrics['macro_recall'], metrics['macro_f1']))
            with self.val_writer.as_default():
                tf.summary.scalar('loss_user',
                                  self.valid_loss_user.result(), step=epoch)
                tf.summary.scalar('accuracy_user',
                                  self.valid_accuracy_user.result(),
                                  step=epoch)
                tf.summary.scalar('macro_precision_user',
                                  metrics['macro_precision'], step=epoch)
                tf.summary.scalar('macro_recall_user',
                                  metrics['macro_recall'], step=epoch)
                tf.summary.scalar('macro_f1_user', metrics['macro_f1'],
                                  step=epoch)

            # update the best seen model based on validation loss
            if self.valid_loss_user.result().numpy() < best_seen['loss']:
                best_seen['loss'] = self.valid_loss_user.result().numpy()
                best_seen['epoch'] = epoch
                best_seen['model'] = self.model
                best_seen['time_not_improved'] = 0
                self.final_pred_right_act = [
                    0 for _ in np.arange(0, self.num_act)
                ]
                self.final_pred_wrong_act = [
                    0 for _ in np.arange(0, self.num_act)
                ]
                self.update_pred_based_on_act(temp_predictions_user,
                                              temp_label_user, temp_label_act)
            else:
                best_seen['time_not_improved'] += 1
                if best_seen['time_not_improved'] >= 6 and epoch > 20:
                    print('early stop')
                    self.valid_loss_user.reset_states()
                    self.valid_accuracy_user.reset_states()
                    break
                elif best_seen['time_not_improved'] == 5:
                    new_lr = self.decay_lr_on_plateau()
                    if new_lr < 0.000001:
                        print('min lr reached')
                        self.valid_loss_user.reset_states()
                        self.valid_accuracy_user.reset_states()
                        break
                    self.optimizer.learning_rate.assign(new_lr)
                    print(f'reduce learning rate on plateau to {new_lr}')
            # reset loss and accuracy after each epoch
            self.valid_loss_user.reset_states()
            self.valid_accuracy_user.reset_states()
        # keep the best model from the finished training run (Model.save might be more appropriate)
        self.best_model = best_seen['model']

    def train_multi_task(self):
        for epoch in range(1, self.epochs + 1):
            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)
            if self.multi_task:
                for batch, label_act, label_user in self.train_data:
                    cm_batch = self.train_step(batch, label_act, label_user,
                                               self.num_user)
                    cm = cm + cm_batch
                metrics = custom_metrics(cm)
                if self.log:
                    print(
                        "TRAIN: epoch: {}/{}, loss_act: {:.5f}, loss_user: {:.5f}, "
                        "acc_act: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, "
                        "macro_recall: {:.5f}, macro_f1: {:.5f}"
                        .format(epoch, self.epochs,
                                self.train_loss_activity.result().numpy(),
                                self.train_loss_user.result().numpy(),
                                self.train_accuracy_activity.result().numpy(),
                                self.train_accuracy_user.result().numpy(),
                                metrics['macro_precision'],
                                metrics['macro_recall'],
                                metrics['macro_f1']))
                with self.train_writer.as_default():
                    tf.summary.scalar('loss_activity',
                                      self.train_loss_activity.result(),
                                      step=epoch)
                    tf.summary.scalar('accuracy_activity',
                                      self.train_accuracy_activity.result(),
                                      step=epoch)
                    tf.summary.scalar('loss_user',
                                      self.train_loss_user.result(),
                                      step=epoch)
                    tf.summary.scalar('accuracy_user',
                                      self.train_accuracy_user.result(),
                                      step=epoch)
                    tf.summary.scalar('macro_precision',
                                      metrics['macro_precision'], step=epoch)
                    tf.summary.scalar('macro_recall', metrics['macro_recall'],
                                      step=epoch)
                    tf.summary.scalar('macro_f1', metrics['macro_f1'],
                                      step=epoch)
                self.train_loss_activity.reset_states()
                self.train_loss_user.reset_states()
                self.train_accuracy_activity.reset_states()
                self.train_accuracy_user.reset_states()
            cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)
            for batch, label_act, label_user in self.test_data:
                if epoch == self.epochs:
                    cm_batch, predictions_user = self.valid_step(
                        batch, label_act, label_user, self.num_user)
                    cm = cm + cm_batch
                    self.update_pred_based_on_act(predictions_user,
                                                  label_user, label_act)
                else:
                    cm_batch, _ = self.valid_step(batch, label_act,
                                                  label_user, self.num_user)
                    cm = cm + cm_batch
            metrics = \
                custom_metrics(cm)
            with self.val_writer.as_default():
                tf.summary.scalar('loss_activity',
                                  self.valid_loss_activity.result(),
                                  step=epoch)
                tf.summary.scalar('accuracy_activity',
                                  self.valid_accuracy_activity.result(),
                                  step=epoch)
                tf.summary.scalar('loss_user', self.valid_loss_user.result(),
                                  step=epoch)
                tf.summary.scalar('accuracy_user',
                                  self.valid_accuracy_user.result(),
                                  step=epoch)
                tf.summary.scalar('macro_precision',
                                  metrics['macro_precision'], step=epoch)
                tf.summary.scalar('macro_recall', metrics['macro_recall'],
                                  step=epoch)
                tf.summary.scalar('macro_f1', metrics['macro_f1'], step=epoch)
            if self.log:
                print(
                    "VALIDATION: epoch: {}/{}, loss_act: {:.5f}, loss_user: {:.5f}, "
                    "acc_act: {:.5f}, acc_user: {:.5f}, macro_precision: {:.5f}, "
                    "macro_recall: {:.5f}, macro_f1: {:.5f}"
                    .format(epoch, self.epochs,
                            self.valid_loss_activity.result().numpy(),
                            self.valid_loss_user.result().numpy(),
                            self.valid_accuracy_activity.result().numpy(),
                            self.valid_accuracy_user.result().numpy(),
                            metrics['macro_precision'],
                            metrics['macro_recall'], metrics['macro_f1']))
            self.valid_loss_activity.reset_states()
            self.valid_loss_user.reset_states()
            self.valid_accuracy_activity.reset_states()
            self.valid_accuracy_user.reset_states()
            if self.lr == 'dynamic':
                new_lr = self.decay_lr(self.init_lr, self.drop_factor,
                                       self.drop_epoch, epoch=epoch)
                self.optimizer.learning_rate.assign(new_lr)
                with self.train_writer.as_default():
                    tf.summary.scalar("learning_rate", new_lr, step=epoch)

    def decay_lr(self, init_lr, drop_factor, drops_epoch, epoch):
        exp = np.floor((1 + epoch) / drops_epoch)
        alpha = init_lr * (drop_factor ** exp)
        return float(alpha)

    def decay_lr_on_plateau(self):
        lr = self.optimizer.learning_rate
        return lr * self.drop_factor

    def test_model(self, log=False):
        # reset the variables used to plot the per-activity error percentage
        self.final_pred_right_act = [0 for _ in np.arange(0, self.num_act)]
        self.final_pred_wrong_act = [0 for _ in np.arange(0, self.num_act)]
        temp_predictions_user = []
        temp_label_user = []
        temp_label_act = []
        cm = tf.zeros(shape=(self.num_user, self.num_user), dtype=tf.int32)
        for batch, label_act, label_user in self.test_data:
            cm_batch, predictions_user = self.valid_step(
                batch, label_act, label_user, self.num_user)
            cm = cm + cm_batch
            temp_predictions_user.extend(predictions_user.numpy())
            temp_label_user.extend(label_user.numpy())
            temp_label_act.extend(label_act.numpy())
        metrics = custom_metrics(cm)
        self.update_pred_based_on_act(temp_predictions_user, temp_label_user,
                                      temp_label_act)
        print(
            "\nTEST FINAL: loss_user: {:.5f}, acc_user: {:.5f}, "
            "macro_precision: {:.5f}, macro_recall: {:.5f}, macro_f1: {:.5f}"
            .format(self.valid_loss_user.result().numpy(),
                    self.valid_accuracy_user.result().numpy(),
                    metrics['macro_precision'], metrics['macro_recall'],
                    metrics['macro_f1']))
        # confusion matrix
        if log:
            df_cm = pd.DataFrame(
                cm.numpy(),
                index=[str(i) for i in range(0, self.num_user)],
                columns=[str(i) for i in range(0, self.num_user)])
            plt.figure(figsize=(30, 21))
            sn.heatmap(df_cm, annot=True)
            plt.show()
        return self.valid_accuracy_user.result().numpy(), metrics['macro_f1']

    def plot_distribution_data(self, val_test=True):
        if val_test:
            if self.test is not None:
                col = 3
            else:
                col = 2
        else:
            col = 1
        row = 2
        plt.figure(figsize=(12, 3))
        plt.style.use('seaborn-darkgrid')

        ### distribution of users ###
        user_distributions = []
        for user in np.arange(self.num_user):
            plt.subplot(row, col, 1)
            plt.title('Train user')
            number_user = len([i for i in self.train_user if i == user])
            user_distributions.append(number_user)
        plt.bar(x=list(range(1,
                        len(user_distributions) + 1)),
                height=user_distributions)
        if val_test:
            user_distributions = []
            for user in np.arange(self.num_user):
                plt.subplot(row, col, 2)
                plt.title('Val user')
                number_user = len([i for i in self.val_user if i == user])
                user_distributions.append(number_user)
            plt.bar(x=list(range(1, len(user_distributions) + 1)),
                    height=user_distributions)
            user_distributions = []
            for user in np.arange(self.num_user):
                plt.subplot(row, col, 3)
                plt.title('Test user')
                number_user = len([i for i in self.test_user if i == user])
                user_distributions.append(number_user)
            plt.bar(x=list(range(1, len(user_distributions) + 1)),
                    height=user_distributions)

        ### distribution of activities ###
        act_distributions = []
        for act in np.arange(self.num_act):
            plt.subplot(row, col, 1 + col)
            plt.title('Train activity')
            number_act = len([i for i in self.train_act if i == act])
            act_distributions.append(number_act)
        plt.bar(x=list(range(1, len(act_distributions) + 1)),
                height=act_distributions)
        if val_test:
            act_distributions = []
            for act in np.arange(self.num_act):
                plt.subplot(row, col, 2 + col)
                plt.title('Val activity')
                number_act = len([i for i in self.val_act if i == act])
                act_distributions.append(number_act)
            plt.bar(x=list(range(1, len(act_distributions) + 1)),
                    height=act_distributions)
            act_distributions = []
            for act in np.arange(self.num_act):
                plt.subplot(row, col, 3 + col)
                plt.title('Test activity')
                number_act = len([i for i in self.test_act if i == act])
                act_distributions.append(number_act)
            plt.bar(x=list(range(1, len(act_distributions) + 1)),
                    height=act_distributions)

        ### distribution of activities per user in the train set ###
        distribution = []
        for user in np.arange(self.num_user):
            distribution.append([])
            for act in np.arange(self.num_act):
                samples = len([
                    i for i, (u, a) in enumerate(
                        zip(self.train_user, self.train_act))
                    if a == act and u == user
                ])
                distribution[user].append(samples)
        plt.figure()
        plt.title('Distribution act for user in train set')
        plt.xlabel('User id')
        plt.ylabel('Act id')
        _ = sn.heatmap(np.transpose(distribution), linewidths=0.3,
                       cmap='YlGnBu', annot=True, fmt="d")
        # plt.tight_layout()
        plt.show()

        if val_test:
            ### distribution of activities per user in the val set ###
            distribution = []  # per-user list of activity counts
            for user in np.arange(self.num_user):
                distribution.append([])
                for act in np.arange(self.num_act):
                    samples = len([
                        i for i, (u, a) in enumerate(
                            zip(self.val_user, self.val_act))
                        if a == act and u == user
                    ])
                    distribution[user].append(samples)
            plt.figure()
            plt.title('Distribution act for user in val set')
            plt.xlabel('User id')
            plt.ylabel('Act id')
            _ = sn.heatmap(np.transpose(distribution), linewidths=0.3,
                           cmap='YlGnBu', annot=True, fmt="d")
            # plt.tight_layout()
            plt.show()

            ### distribution of activities per user in the test set ###
            distribution = []  # per-user list of activity counts
            for user in np.arange(self.num_user):
                distribution.append([])
                for act in np.arange(self.num_act):
                    samples = len([
                        i for i, (u, a) in enumerate(
                            zip(self.test_user, self.test_act))
                        if a == act and u == user
                    ])
                    distribution[user].append(samples)
            plt.figure()
            plt.title('Distribution act for user in test set')
            plt.xlabel('User id')
            plt.ylabel('Act id')
            _ = sn.heatmap(np.transpose(distribution), linewidths=0.3,
                           cmap='YlGnBu', annot=True, fmt="d")
            # plt.tight_layout()
            plt.show()

    def update_pred_based_on_act(self, predictions_user, label_user,
                                 label_activity):
        for pred_label, true_label, act_label in zip(predictions_user,
                                                     label_user,
                                                     label_activity):
            if pred_label == true_label:
                self.final_pred_right_act[act_label] += 1
            else:
                self.final_pred_wrong_act[act_label] += 1

    def total_sample_for_act(self, test):
        total_for_act = [0 for _ in np.arange(0, self.num_act)]
        for act in np.arange(0, self.num_act):
            if test:
                total_for_act[act] += np.unique(self.test_act,
                                                return_counts=True)[1][act]
            else:
                total_for_act[act] += np.unique(self.val_act,
                                                return_counts=True)[1][act]
        return total_for_act

    def plot_pred_based_act(self, title, test, colab_path, save_plot,
                            file_name, show_plot):
        total_for_act = self.total_sample_for_act(test)
        pred_right = np.asarray(self.final_pred_right_act) / \
            np.asarray(total_for_act)
        plot_pred_based_act(correct_predictions=pred_right,
                            label_act=self.mapping_act_label(),
                            title=title,
                            colab_path=colab_path,
                            dataset_name=self.dataset_name_plot,
                            save_plot=save_plot,
                            file_name=file_name,
                            show_plot=show_plot)
        return pred_right

    def unify_act(self, mapping):
        num_class_return, act_train, act_test = self.dataset.unify_act_class(
            self.train_act, self.test_act, mapping)
        self.train_act = act_train
        self.test_act = act_test
        self.num_act = num_class_return
        self.final_pred_right_act = [0 for _ in np.arange(0, self.num_act)]
        self.final_pred_wrong_act = [0 for _ in np.arange(0, self.num_act)]

    def mapping_act_label(self):
        return mapping_act_label(self.dataset_name)
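# Hedged end-to-end sketch of the Model workflow above; the configuration
# object and argument values are assumptions, not from the original source.
#
#   model = Model(dataset_name='unimib', configuration_file=config,
#                 multi_task=False, lr='dynamic', model_type='resnet18_2D',
#                 fold_test=0, magnitude=True)
#   model.create_dataset(run_colab=False, colab_path='')
#   model.load_data()
#   model.normalize_data()
#   model.tf_dataset(method='act', weighted='no')
#   model.build_model()
#   model.loss_opt_metric()
#   model.train_model(epochs=50)
#   model.test_model()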