def buildData(self, srcBatch, goldBatch, svo_batch):
    srcData = []
    tgtData = [] if goldBatch else None
    svoData = []
    tgt_extend_vocab = [] if goldBatch else None
    src_extend_vocab = []
    src_oovs_list = []
    for i, (srcWords, svo_list) in enumerate(zip(srcBatch, svo_batch)):
        srcData += [self.src_dict.convertToIdx(srcWords, Constants.UNK_WORD)]
        svoData += [[self.src_dict.convertToIdx(one_svo, Constants.UNK_WORD)
                     for one_svo in svo_list]]
        if goldBatch:
            tgtData += [self.tgt_dict.convertToIdx(goldBatch[i],
                                                   Constants.UNK_WORD,
                                                   Constants.BOS_WORD,
                                                   Constants.EOS_WORD)]
        if self.opt.pointer_gen:
            # store the temporary OOV vocabulary for this article
            enc_input_extend_vocab, article_oovs = self.article2ids(srcWords, self.src_dict)
            src_extend_vocab += [enc_input_extend_vocab]
            src_oovs_list += [article_oovs]
            if goldBatch:
                abs_ids_extend_vocab = self.abstract2ids(goldBatch[i], self.tgt_dict, article_oovs)
                # overwrite the target so it uses the temporary vocabulary
                vec = []
                vec += [self.src_dict.lookup(Constants.BOS_WORD)]
                vec += abs_ids_extend_vocab
                vec += [self.src_dict.lookup(Constants.EOS_WORD)]
                tgt_extend_vocab.append(torch.LongTensor(vec))
    if goldBatch:
        train = {
            'src': (srcData, svoData),
            'tgt': tgtData,
            'src_extend_vocab': src_extend_vocab,
            'tgt_extend_vocab': tgt_extend_vocab,
            'src_oovs_list': src_oovs_list,
        }
    else:
        train = {
            'src': (srcData, svoData),
            'src_extend_vocab': src_extend_vocab,
            'src_oovs_list': src_oovs_list,
        }
    return Dataset(train, self.opt.batch_size, self.opt.cuda,
                   volatile=True, pointer_gen=self.opt.pointer_gen,
                   is_coverage=self.opt.is_coverage)
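# buildData relies on article2ids/abstract2ids to build the per-article extended
# vocabulary of the pointer-generator. A minimal sketch of that mapping, assuming a
# dictionary object with lookup()/size() semantics like the one above; the unk_id
# handling and function bodies here are illustrative, not the project's exact code.
def article2ids(article_words, vocab, unk_id=0):
    # Map source words to ids; OOV words get temporary ids vocab.size() + k.
    ids, oovs = [], []
    for w in article_words:
        i = vocab.lookup(w)
        if i is None or i == unk_id:  # OOV word
            if w not in oovs:
                oovs.append(w)
            ids.append(vocab.size() + oovs.index(w))
        else:
            ids.append(i)
    return ids, oovs

def abstract2ids(abstract_words, vocab, article_oovs, unk_id=0):
    # Target-side OOVs reuse the temporary ids when the word also appears in the source.
    ids = []
    for w in abstract_words:
        i = vocab.lookup(w)
        if i is None or i == unk_id:
            ids.append(vocab.size() + article_oovs.index(w)
                       if w in article_oovs else unk_id)
        else:
            ids.append(i)
    return ids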
def load_train_data():
    onlinePreprocess.seq_length = opt.max_sent_length_source  # truncation length for training
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    train_data, vocab_dicts = prepare_data_online(opt)
    trainData = Dataset(train_data, opt.batch_size, opt.gpus,
                        pointer_gen=opt.pointer_gen, is_coverage=opt.is_coverage)
    logger.info(' * vocabulary size. source = %d; target = %d' %
                (vocab_dicts['src'].size(), vocab_dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' % len(train_data['src']))
    return trainData, vocab_dicts
class NeuralNetDatasetMaker:
    def __init__(self, mode, dir_model, dataset_options, balanced_datasets=True):
        self.dir_model = dir_model
        self.mode = mode
        self.dataset_options = dataset_options
        self.dataset = Dataset(self.dataset_options)
        self.balanced_datasets = balanced_datasets
        return

    def createDatasets(self):
        print('_getFilenameDatasetBalanced: ' + str(self.mode))
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'traineval':
            if self.balanced_datasets:
                [df_training, df_testing] = self.dataset.getBalancedSubsetTrainingAndTesting()
                self.num_samples_train = df_training.shape[0]
                self.num_samples_validation = df_testing.shape[0]
                filename_train = filename_prefix + '_balanced_train.csv'
                filename_eval = filename_prefix + '_balanced_eval.csv'
                df_training.to_csv(filename_train, line_terminator='\n', index=False)
                df_testing.to_csv(filename_eval, line_terminator='\n', index=False)
                print(filename_train)
                print(filename_eval)
            else:
                [training, testing] = self.dataset.getTrainingAndTestingSet()
                df_training_pos = training[0]
                df_training_neg = training[1]
                df_eval_pos = testing[0]
                df_eval_neg = testing[1]
                self.num_samples_train = 2 * int(df_training_neg.shape[0])
                self.num_samples_validation = 2 * int(df_eval_neg.shape[0])
                filename_train_pos = filename_prefix + '_train_pos.csv'
                filename_train_neg = filename_prefix + '_train_neg.csv'
                filename_eval_pos = filename_prefix + '_eval_pos.csv'
                filename_eval_neg = filename_prefix + '_eval_neg.csv'
                df_training_pos.to_csv(filename_train_pos, line_terminator='\n', index=False)
                df_training_neg.to_csv(filename_train_neg, line_terminator='\n', index=False)
                df_eval_pos.to_csv(filename_eval_pos, line_terminator='\n', index=False)
                df_eval_neg.to_csv(filename_eval_neg, line_terminator='\n', index=False)
        else:
            if self.balanced_datasets:
                df_balanced = self.dataset.getBalancedSubSet()
                filename_dataset = filename_prefix + '_balanced_' + self.mode + '.csv'
                df_balanced.to_csv(filename_dataset, line_terminator='\n', index=False)
                print(filename_dataset)
            else:
                print('no valid configuration of datasets and mode..exit')
                sys.exit()

    def removeDatasets(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.balanced_datasets:
            filename_dataset = filename_prefix + '_balanced_' + self.mode + '.csv'
            print('remove: ' + str(filename_dataset))
            os.remove(filename_dataset)
        else:
            print('no valid configuration of datasets and mode..exit')
            sys.exit()

    def _dfToFile(self, df, filename):
        list_df = [df[i:i + 10000] for i in range(0, df.shape[0], 10000)]
        list_df[0].to_csv(filename, index=False, line_terminator='\n')
        for l in list_df[1:]:
            l.to_csv(filename, index=False, line_terminator='\n', header=False, mode='a')

    def createDatasetsAutoEncoder(self):
        print('_getFilenameDatasetBalanced: ' + str(self.mode))
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'traineval':
            df = self.dataset.getData()
            df = df.sample(frac=1)
            print('num samples: ' + str(df.shape[0]))
            print('df.shape: ' + str(df.shape))
            num_samples = df.shape[0]
            ratio_train_test = self.dataset_options.getRatioTrainingSamples()
            df_train = df[:int(round(ratio_train_test * num_samples))]
            df_eval = df[int(round(ratio_train_test * num_samples)):]
            filename_train = filename_prefix + '_balanced_train.csv'
            filename_eval = filename_prefix + '_balanced_eval.csv'
            self._dfToFile(df_train, filename_train)
            self._dfToFile(df_eval, filename_eval)
        else:
            filename_test = filename_prefix + '_test.csv'
            df = self.dataset.getData()
            df = df.sample(frac=1)
            self._dfToFile(df, filename_test)
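# The _dfToFile helper above writes large frames in 10,000-row slices so the header is
# emitted once and later slices are appended. A standalone sketch of the same pattern
# with pandas (filename and data are illustrative; note that newer pandas versions spell
# the keyword 'lineterminator' instead of 'line_terminator'):
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(25000, 3), columns=['a', 'b', 'c'])
chunks = [df[i:i + 10000] for i in range(0, df.shape[0], 10000)]
chunks[0].to_csv('example.csv', index=False)                           # first chunk writes the header
for chunk in chunks[1:]:
    chunk.to_csv('example.csv', index=False, header=False, mode='a')   # the rest are appended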
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate (lr_xx), activation_function (act_xx), nodes_of_layers (dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    num = 30
    data = Dataset('coil_2views')
    x1, x2, gt = data.load_data()
    X = dict()
    X[str(0)], X[str(1)] = x1, x2
    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)
    para_lambda = 1
    batch_size = X['0'].shape[0]
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-2
    epochs_pre = 300
device = torch.device("cuda") G = AEI_Net(512).to(device) D = MultiscaleDiscriminator(input_nc=3, ndf=64, n_layers=6, norm_layer=torch.nn.InstanceNorm2d).to(device) G.train() D.train() arcface = Backbone(50, 0.6, 'ir_se').to(device) arcface.eval() arcface.load_state_dict(torch.load("./model_weights/model_ir_se50.pth")) dataset = Dataset("./inputs/processed") dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0) MSE = torch.nn.MSELoss() L1 = torch.nn.L1Loss() def hinge_loss(X, positive=True): if positive: return torch.relu(1 - X).mean() return torch.relu(X).mean() def get_grid_image(X): X = X[:8]
class NeuralNetDatasetHandler:
    def __init__(self, dir_model, dataset_options, feature_columns, mode,
                 balanced_datasets=True, resample_datasets=False):
        self.dir_model = dir_model
        self.dataset_options = dataset_options
        self.dataset = Dataset(self.dataset_options)
        self.feature_columns = feature_columns
        self.mode = mode
        self.balanced_datasets = balanced_datasets
        self.resample_datasets = resample_datasets
        return

    def _parse_csv(self, value):
        # print('Parsing', data_file)
        column_names = self.dataset.getColumnsData()
        default_values = self.feature_columns.getDefaultValues(column_names)
        columns = tf.decode_csv(value, record_defaults=default_values)
        features = dict(zip(column_names, columns))
        early_readmission_flagname = self.dataset_options.getEarlyReadmissionFlagname()
        labels = features.pop(early_readmission_flagname)
        return features, tf.equal(labels, 1)

    def _parse_csv_autoencoder(self, value):
        # print('Parsing', data_file)
        column_names = self.dataset.getColumnsData()
        default_values = self.feature_columns.getDefaultValues(column_names)
        columns = tf.decode_csv(value, record_defaults=default_values)
        features = dict(zip(column_names, columns))
        numeric_id_labels = features.pop('main_diag_ind')
        return features, tf.convert_to_tensor(numeric_id_labels)

    def _parse_csv_encode_maindiag(self, value):
        # print('Parsing', data_file)
        column_names = self.dataset.getColumnsData()
        default_values = self.feature_columns.getDefaultValues(column_names)
        columns = tf.decode_csv(value, record_defaults=default_values)
        features = dict(zip(column_names, columns))
        numeric_id_labels = features.pop('main_diag_ind')
        features = {'diag': features.pop('main_diag')}
        return features, tf.convert_to_tensor(numeric_id_labels)

    def _getFilenameDatasetBalanced(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'train':
            filename_train = filename_prefix + '_balanced_train.csv'
            filename = filename_train
        elif self.mode == 'eval':
            filename_eval = filename_prefix + '_balanced_eval.csv'
            filename = filename_eval
        elif self.mode == 'test':
            filename_test = filename_prefix + '_balanced_test.csv'
            filename = filename_test
        else:
            print('unknown mode...exit')
            sys.exit()
        return filename

    def _getFilenamesDatasetAll(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'train':
            filename_train_pos = filename_prefix + '_train_pos.csv'
            filename_train_neg = filename_prefix + '_train_neg.csv'
            filenames = [filename_train_pos, filename_train_neg]
        elif self.mode == 'eval':
            filename_eval_pos = filename_prefix + '_eval_pos.csv'
            filename_eval_neg = filename_prefix + '_eval_neg.csv'
            filenames = [filename_eval_pos, filename_eval_neg]
        elif self.mode == 'test':
            filename_test_pos = filename_prefix + '_test_pos.csv'
            filename_test_neg = filename_prefix + '_test_neg.csv'
            filenames = [filename_test_pos, filename_test_neg]
        else:
            print('unknown mode...exit')
            sys.exit()
        return filenames

    def _getFilenameDatasetAutoEncoder(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'train':
            filename_train = filename_prefix + '_balanced_train.csv'
            filename = filename_train
        elif self.mode == 'eval':
            filename_eval = filename_prefix + '_balanced_eval.csv'
            filename = filename_eval
        elif self.mode == 'test':
            filename_test = filename_prefix + '_test.csv'
            filename = filename_test
        else:
            print('unknown mode...exit')
            sys.exit()
        return filename

    def _dataset_reader(self):
        if self.balanced_datasets:
            filename_dataset = self._getFilenameDatasetBalanced()
            # shuffle is only performed for training; not optimal --> maybe give another flag to specify training/eval
            print('read: ' + str(filename_dataset))
            dataset = tf.data.TextLineDataset(filename_dataset)
            dataset = dataset.skip(1)
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamplesBalancedSubset())
            dataset = dataset.map(self._parse_csv, num_parallel_calls=5)
            return dataset
        else:
            filenames_dataset = self._getFilenamesDatasetAll()
            data_file_pos = filenames_dataset[0]
            data_file_neg = filenames_dataset[1]
            # Extract lines from input files using the Dataset API.
            ds_pos = tf.data.TextLineDataset(data_file_pos)
            ds_neg = tf.data.TextLineDataset(data_file_neg)
            ds_pos = ds_pos.skip(1)
            ds_neg = ds_neg.skip(1)
            ds_neg = ds_neg.map(self._parse_csv, num_parallel_calls=5)
            ds_pos = ds_pos.map(self._parse_csv, num_parallel_calls=5)
            dataset = tf.data.Dataset.zip((ds_pos, ds_neg))
            # Each input element will be converted into a two-element `Dataset` using
            # `Dataset.from_tensors()` and `Dataset.concatenate()`, then `Dataset.flat_map()`
            # will flatten the resulting `Dataset`s into a single `Dataset`.
            dataset = dataset.flat_map(
                lambda ex_pos, ex_neg: tf.data.Dataset.from_tensors(ex_pos).concatenate(
                    tf.data.Dataset.from_tensors(ex_neg)))
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamplesBalancedSubset())
            return dataset

    def _dataset_reader_autoencoder(self):
        if self.balanced_datasets:
            filename_dataset = self._getFilenameDatasetAutoEncoder()
            print(filename_dataset)
            # shuffle is only performed for training; not optimal --> maybe give another flag to specify training/eval
            dataset = tf.data.TextLineDataset(filename_dataset)
            dataset = dataset.skip(1)
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamples())
            dataset = dataset.map(self._parse_csv_autoencoder, num_parallel_calls=5)
            return dataset
        else:
            filenames_dataset = self._getFilenameDatasetAutoEncoder()
            data_file_pos = filenames_dataset[0]
            data_file_neg = filenames_dataset[1]
            # Extract lines from input files using the Dataset API.
            ds_pos = tf.data.TextLineDataset(data_file_pos)
            ds_neg = tf.data.TextLineDataset(data_file_neg)
            ds_pos = ds_pos.skip(1)
            ds_neg = ds_neg.skip(1)
            ds_neg = ds_neg.map(self._parse_csv_autoencoder, num_parallel_calls=5)
            ds_pos = ds_pos.map(self._parse_csv_autoencoder, num_parallel_calls=5)
            dataset = tf.data.Dataset.zip((ds_pos, ds_neg))
            # Each input element will be converted into a two-element `Dataset` using
            # `Dataset.from_tensors()` and `Dataset.concatenate()`, then `Dataset.flat_map()`
            # will flatten the resulting `Dataset`s into a single `Dataset`.
            dataset = dataset.flat_map(
                lambda ex_pos, ex_neg: tf.data.Dataset.from_tensors(ex_pos).concatenate(
                    tf.data.Dataset.from_tensors(ex_neg)))
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamples())
            return dataset

    def _dataset_reader_encode_main_diag(self):
        filename_dataset = self._getFilenameDatasetAutoEncoder()
        print(filename_dataset)
        # shuffle is only performed for training; not optimal --> maybe give another flag to specify training/eval
        dataset = tf.data.TextLineDataset(filename_dataset)
        dataset = dataset.skip(1)
        if self.mode == 'train':
            dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamples())
        dataset = dataset.map(self._parse_csv_encode_maindiag, num_parallel_calls=5)
        return dataset

    def update_model_dir(self, dir_model):
        self.dir_model = dir_model

    def readDatasetTF(self):
        return self._dataset_reader()

    def readDatasetAE(self):
        return self._dataset_reader_autoencoder()

    def getDatasetEncodeMainDiag(self):
        return self._dataset_reader_encode_main_diag()
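# The zip/flat_map trick in _dataset_reader yields strictly alternating positive/negative
# examples. A minimal, self-contained sketch of just that pattern (assuming TensorFlow 1.x,
# matching the tf.decode_csv usage above; the toy tensors are illustrative):
import tensorflow as tf

ds_pos = tf.data.Dataset.from_tensor_slices([1, 1, 1])   # stand-in "positive" examples
ds_neg = tf.data.Dataset.from_tensor_slices([0, 0, 0])   # stand-in "negative" examples

interleaved = tf.data.Dataset.zip((ds_pos, ds_neg)).flat_map(
    lambda ex_pos, ex_neg: tf.data.Dataset.from_tensors(ex_pos).concatenate(
        tf.data.Dataset.from_tensors(ex_neg)))

next_element = interleaved.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    print([sess.run(next_element) for _ in range(6)])  # [1, 0, 1, 0, 1, 0]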
def encode(flags_obj): """Run Wide-Deep training and eval loop. Args: flags_obj: An object containing parsed flag values. """ dict_data_training = { 'dir_data': DIRPROJECT + 'data/', 'data_prefix': 'nz', 'dataset': '20012016', 'encoding': 'embedding', 'newfeatures': None, 'featurereduction': { 'method': 'FUSION' }, 'grouping': 'verylightgrouping' } dataset_options_training = DatasetOptions(dict_data_training) dict_data_encoding = { 'dir_data': DIRPROJECT + 'data/', 'data_prefix': 'nz', 'dataset': '2017', 'encoding': 'embedding', 'newfeatures': None, 'featurereduction': { 'method': 'FUSION' }, 'grouping': 'verylightgrouping' } dataset_options_encoding = DatasetOptions(dict_data_encoding) feature_columns = FeatureColumnsAutoEncoderNZ( dataset_options=dataset_options_encoding) dict_dataset_options = { 'train': dataset_options_training, 'eval': None, 'test': dataset_options_encoding } nn = AutoEncoderModel('test', dict_dataset_options, feature_columns, flags_obj) diag_encodings = nn.encode() print('diag_encodings --> main diag: ' + str(diag_encodings[0].shape)) print('diag_encodings --> secondary diags: ' + str(diag_encodings[1].shape)) main_diag_encodings = diag_encodings[0] sec_diag_encodings = diag_encodings[1] dataset_encoding = Dataset(dataset_options_encoding) df_encoding = dataset_encoding.getDf() print('df_encoding: ' + str(df_encoding.shape)) num_encoded_dim = main_diag_encodings.shape[1] dir_data = dataset_options_encoding.getDirData() dataset = dataset_options_encoding.getDatasetName() data_prefix = dataset_options_encoding.getDataPrefix() demographic_featurename = dataset_options_encoding.getFilenameOptionDemographicFeatures( ) featureset_str = dataset_options_encoding.getFeatureSetStr() encoding = dataset_options_encoding.getEncodingScheme() name_event_column = dataset_options_encoding.getEventColumnName() name_main_diag = dataset_options_encoding.getNameMainDiag() name_sec_diag = dataset_options_encoding.getNameSecDiag() df_encoding_sec_diag = df_encoding[name_event_column].to_frame() df_encoding_main_diag = df_encoding[name_event_column].to_frame() num_encoded_dim = sec_diag_encodings.shape[1] for k in range(0, num_encoded_dim): new_col_secdiag = name_sec_diag + '_dim_' + str(k) df_encoding_sec_diag[new_col_secdiag] = sec_diag_encodings[:, k] new_col_maindiag = name_main_diag + '_dim_' + str(k) df_encoding_main_diag[new_col_maindiag] = main_diag_encodings[:, k] print('df_encoding_main_diag: ' + str(df_encoding_main_diag.shape)) print('df_encoding_sec_diag: ' + str(df_encoding_sec_diag.shape)) filename_sec_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_sec_diag + '_' + str( num_encoded_dim) + 'dim.csv' filename_main_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_main_diag + '_' + str( num_encoded_dim) + 'dim.csv' list_df = [ df_encoding_sec_diag[i:i + 10000] for i in range(0, df_encoding_sec_diag.shape[0], 10000) ] list_df[0].to_csv(filename_sec_diag_encoding, index=False, line_terminator='\n') for l in list_df[1:]: l.to_csv(filename_sec_diag_encoding, index=False, line_terminator='\n', header=False, mode='a') list_df = [ df_encoding_main_diag[i:i + 10000] for i in range(0, df_encoding_main_diag.shape[0], 10000) ] list_df[0].to_csv(filename_main_diag_encoding, index=False, line_terminator='\n') for l in list_df[1:]: l.to_csv(filename_main_diag_encoding, index=False, line_terminator='\n', header=False, mode='a')
logger.info('Cannot find preprocess data %s, program will shut down.', '{}.preprocessed.pickle'.format(train_file_name_prefix)) sys.exit() dev_file_name_prefix, fileExist = checkPreprocessFile( dev_file, add_query_node) if not fileExist: logger.info('Cannot find preprocess data %s, program will shut down.', '{}.preprocessed.pickle'.format(dev_file_name_prefix)) sys.exit() if not evaluation_mode: logger.info('Loading preprocessed training data file %s', '{}.preprocessed.pickle'.format(train_file_name_prefix)) dataset = Dataset(train_file_name_prefix, use_elmo, use_glove, use_extra_feature, max_nodes=500, max_query_size=25, max_candidates=80, max_candidates_len=10) logger.info('Loading preprocessed development data file %s', '{}.preprocessed.pickle'.format(dev_file_name_prefix)) dev_dataset = Dataset(dev_file_name_prefix, use_elmo, use_glove, use_extra_feature, max_nodes=500, max_query_size=25, max_candidates=80, max_candidates_len=10) else: logger.info('Loading preprocessed evaluation data file %s',
Description: Nothing
FilePath: /Signal-1/AE2-Nets-master/test_CUB.py
'''
from utils.Dataset import Dataset
from AE_BinAE_revise import MaeAEModel
from model import model
from utils.print_result import print_result
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
'''
each net has its own learning_rate (lr_xx), activation_function (act_xx), nodes_of_layers (dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    data = Dataset('CUB_c10_2views')
    x1, x2, gt = data.load_data()
    x1 = data.normalize(x1, 0)
    x2 = data.normalize(x2, 0)
    n_clusters = len(set(gt))
    print(x1.shape)
    print(x2.shape)
    print(gt.shape)
    # act_ae1, act_ae2, act_dg1, act_dg2 = 'sigmoid', 'sigmoid', 'sigmoid', 'sigmoid'
    v1_aedims_ = [[x1.shape[1], 512, 256], [256, 512, x1.shape[1]]]
    v2_aedims_ = [[x2.shape[1], 256, 128], [128, 256, x2.shape[1]]]  # original
    mae_dims_ = [[256, 128, 64], [128, 128, 64], [64, 128, 256], [64, 128, 128]]  # currently used
G = AEI_Net(512).to(device) D = MultiscaleDiscriminator(input_nc=3, ndf=64, n_layers=6, norm_layer=torch.nn.InstanceNorm2d).to(device) G.train() D.train() arcface = Backbone(50, 0.6, 'ir_se').to(device) arcface.eval() arcface.load_state_dict(torch.load("./model_weights/model_ir_se50.pth")) opt_G = optim.Adam(G.parameters(), lr=lr_G, betas=(0, 0.999)) opt_D = optim.Adam(D.parameters(), lr=lr_D, betas=(0, 0.999)) dataset = Dataset("./dataset/celeb/", same_prob=0.2) dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0, drop_last=True) MSE = torch.nn.MSELoss() L1 = torch.nn.L1Loss() def hinge_loss(X, positive=True): if positive: return torch.relu(1 - X).mean() return torch.relu(X).mean()
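# The hinge_loss helper defined above implements the two halves of a hinge-GAN objective.
# A hedged usage sketch with stand-in score tensors (the real MultiscaleDiscriminator may
# return lists of per-scale feature maps, so this is illustrative only, not the training
# script's actual loss wiring):
import torch

def hinge_loss(X, positive=True):
    if positive:
        return torch.relu(1 - X).mean()
    return torch.relu(X).mean()

d_real = torch.randn(8)   # stand-in discriminator scores on real images
d_fake = torch.randn(8)   # stand-in discriminator scores on generated images

# Discriminator: push real scores above +1 and fake scores below -1.
loss_D = hinge_loss(d_real, positive=True) + hinge_loss(-d_fake, positive=True)
# Generator: raise the scores of its own samples.
loss_G_adv = -d_fake.mean()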
def train():
    print("*" * 100)
    print("train begin")
    # use gpu
    use_gpu = args.device is not None
    if torch.cuda.is_available() and not use_gpu:
        print("WARNING: You have a CUDA device, should run with -device 0")
    if use_gpu:
        # set cuda device and seed
        torch.cuda.set_device(args.device)
        torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)
    # prepare paths
    embedding_file_path = os.path.join(args.project, "embedding.npz")
    vocab_file_path = os.path.join(args.project, "word2id.json")
    end_train_file = os.path.join(args.input, "train_files", "train.txt")
    train_files_dir = os.path.join(args.input, "train_files")
    # merge text files that share the same suffix
    merge_same_suf_text_file(train_files_dir, end_train_file, '.txt')
    print('Loading vocab, train and val dataset. Wait a second, please')
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    # embed = torch.Tensor(list(np.load(args.embedding)))
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)
    with open(end_train_file) as f:
        examples = list()
        for line in tqdm(f):
            if line and not line.isspace():
                examples.append(json.loads(line))
    train_dataset = Dataset(examples)
    print(train_dataset[:1])
    args.embed_num = embed.size(0)  # read the dimensions from the embedding
    args.embed_dim = embed.size(1)
    # args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]
    net = getattr(models, args.model)(args, embed)
    if use_gpu:
        net.cuda()
    train_iter = DataLoader(dataset=train_dataset,
                            batch_size=args.batch_size,
                            shuffle=False)
    criterion = nn.BCELoss()
    params = sum(p.numel() for p in list(net.parameters())) / 1e6
    print('#Params: %.1fM' % (params))
    min_loss = float('inf')
    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    net.train()
    t1 = time()
    for epoch in range(1, args.max_epoch + 1):
        print("*" * 10, 'epoch ', str(epoch), '*' * 50)
        for i, batch in enumerate(train_iter):
            print("*" * 10, 'batch', i, '*' * 10)
            features, targets, _, doc_lens = vocab.make_features(batch, args.seq_trunc)
            features, targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features, doc_lens)
            loss = criterion(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
        net.save()
        print('Epoch: %2d Loss: %f' % (epoch, loss))
    t2 = time()
    print('Total Cost:%f h' % ((t2 - t1) / 3600))
    print("model configuration files saved to the output folder")
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate (lr_xx), activation_function (act_xx), nodes_of_layers (dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    num = 30
    data = Dataset('handwritten_6views')
    X, gt = data.load_data()
    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)
    para_lambda = 1
    batch_size = 2000
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-1
    epochs_pre = 10
    epochs_total = 20
} dict_options_dataset_testing = { 'dir_data': dirData, 'data_prefix': 'patrec', 'dataset': '20162017', 'encoding': 'categorical', 'newfeatures': { 'names': constantsPATREC.NEW_FEATURES }, 'featurereduction': None, 'grouping': 'verylightgrouping', 'filtering': 'EntlassBereich_Gyn' } options_training = DatasetOptions(dict_options_dataset_training) dataset_training = Dataset(dataset_options=options_training) dict_opt_rf = { 'n_estimators': 500, 'max_depth': 50 } options_rf = OptionsRF( dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_rf) clf_rf = ClassifierRF(options_rf) dict_opt_lr = { 'penalty': 'l1', 'C': 0.5 }
dirModelsBase, options_training.getFilenameOptions(filteroptions=True), options_clf=dict_opt_sgd) clf_sgd = ClassifierSGD(options_sgd) dict_options_dataset_training = { 'dir_data': dirData, 'data_prefix': 'nz', 'dataset': '2016', 'newfeatures': { 'names': constantsNZ.NEW_FEATURES }, 'featurereduction': None } options_testing = DatasetOptions(dict_options_dataset_training) dataset_testing = Dataset(dataset_options=options_testing) years = [2012, 2013, 2014, 2015] for year in years: dict_options_dataset_training = { 'dir_data': dirData, 'data_prefix': 'nz', 'dataset': str(year), 'newfeatures': { 'names': constantsNZ.NEW_FEATURES }, 'featurereduction': None } options_training = DatasetOptions(dict_options_dataset_training) dataset_training = Dataset(dataset_options=options_training)
parser.add_argument('--gallery_feature_dir', type=str) parser.add_argument('--query_feature_dir', type=str) parser.add_argument('--useCAM', action='store_true') args = parser.parse_args() data_transforms = transforms.Compose([ transforms.Resize((args.img_h, args.img_w)), transforms.ToTensor(), transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) ]) # image_datasets = {x: datasets.ImageFolder(os.path.join(args.test_dir, x) ,data_transforms) for x in ['gallery','query']} image_datasets = { x: Dataset(os.path.join(args.test_dir, x), data_transforms, CAM=args.useCAM) for x in ['gallery', 'query'] } # labelsloader = {x: iter(image_datasets[x].imgs) for x in ['gallery', 'query']} dataloaders = { x: torch.utils.data.DataLoader(image_datasets[x], batch_size=args.batch_size, shuffle=False, num_workers=4) for x in ['gallery', 'query'] } def load_network(network): save_path = os.path.join(args.model_save_dir,
FilePath: /Signal-1/AE2-Nets-master/test_Caltech.py
'''
from utils.Dataset import Dataset
from AE_BinAE_revise import MaeAEModel
from model import model
from utils.print_result import print_result
import os
from collections import Counter

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
'''
each net has its own learning_rate (lr_xx), activation_function (act_xx), nodes_of_layers (dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    data = Dataset('Caltech101_7_2views')
    x1, x2, gt = data.load_data()
    x1 = data.normalize(x1, 0)
    x2 = data.normalize(x2, 0)
    n_clusters = len(set(gt))
    print(x1.shape)
    print(x2.shape)
    print(n_clusters)
    # act_ae1, act_ae2, act_dg1, act_dg2 = 'sigmoid', 'sigmoid', 'sigmoid', 'sigmoid'
    v1_aedims_ = [[x1.shape[1], 1024, 512, 256], [256, 512, 1024, x1.shape[1]]]
    v2_aedims_ = [[x2.shape[1], 256, 128], [128, 256, x2.shape[1]]]  # original
    mae_dims_ = [[256, 256], [128, 128, 64], [256, 256], [64, 128, 128]]  # currently used
    # dims_dg1 = [64, 100]
import torch.nn as nn from torch.autograd import Variable from torch.utils.data import DataLoader from config.test_config import TestConfig import os import numpy as np from PIL import Image opt = TestConfig().parse() model = CycleGAN(opt) model.load_state_dict( torch.load('log/snapshot/' + opt.name + '_snapshot_' + str(opt.epoch) + '.pkl')) model.eval() model.cuda() dataset = Dataset(opt) data_loader = DataLoader(dataset, batch_size=1, shuffle=opt.shuffle, num_workers=4) pic_dir = opt.pic_dir for iteration, input in enumerate(data_loader): model.deal_with_input(input) model.test() g_A = model.generated_A.cpu().numpy() g_B = model.generated_B.cpu().numpy() c_A = model.cycled_A.cpu().numpy() c_B = model.cycled_B.cpu().numpy() #g_A = Image.fromarray(((g_A+1.)/2.*255).astype(np.uint8).transpose(1,2,0)) #g_A.save(os.path.join(pic_dir, 'generated_A_'+str(opt.epoch)+'.png'))
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate (lr_xx), activation_function (act_xx), nodes_of_layers (dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    num = 30
    data = Dataset('ORL_3views')
    X, gt = data.load_data()
    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)
    para_lambda = 1
    batch_size = X['0'].shape[0]
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-2
    epochs_pre = 50
    epochs_total = 200
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[tf.assign(oldv, newv)
                 for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])

    atarg = tf.placeholder(dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return
    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32,
                            shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))
    ratio = newprob / oldprob

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)
    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold
    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])
    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards
    running_scores = []

    assert sum([max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / (atarg.std() + 1e-8)  # standardized advantage function estimate
        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data
        for num_epoch in count():
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"], batch["atarg"],
                                            batch["vtarg"], cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            agg_mean_kl = get_mean_kl(ob, ac)
            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        rewbuffer.extend(rews)
        mean_score = None
        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)
            logger.dump_tabular()

    return running_scores
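# The optimization loop above consumes a Dataset with iterate_once(batch_size), the
# baselines-style shuffled minibatch container. A minimal sketch of that interface
# (class and variable names here are illustrative, not the project's exact class):
import numpy as np

class MinibatchDataset:
    def __init__(self, data_map, shuffle=True):
        self.data_map = data_map
        self.shuffle = shuffle
        self.n = next(iter(data_map.values())).shape[0]

    def iterate_once(self, batch_size):
        idx = np.arange(self.n)
        if self.shuffle:
            np.random.shuffle(idx)
        for start in range(0, self.n, batch_size):
            sel = idx[start:start + batch_size]
            yield {k: v[sel] for k, v in self.data_map.items()}

d = MinibatchDataset(dict(ob=np.zeros((256, 4)), ac=np.zeros((256, 2))))
for batch in d.iterate_once(64):
    pass  # each batch["ob"] has shape (64, 4)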
from utils.DatasetFilter import DatasetFilter from utils.Dataset import Dataset from utils.DatasetOptions import DatasetOptions import helpers.constants as constants import helpers.constantsNZ as constantsNZ dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/' dirData = dirProject + 'data/' dirPlotsBase = dirProject + 'plots/feature_comparison_wiederkehrer_normal/' dict_options_analyzing = { 'dir_data': dirData, 'data_prefix': 'patrec', 'dataset': '20122015', 'grouping': 'verylightgrouping', 'encoding': 'categorical', 'newfeatures': { 'names': constants.NEW_FEATURES }, 'featurereduction': None, 'filter_options': 'chronic_lung' } options = DatasetOptions(dict_options_analyzing) dataset = Dataset(options) datafilter = DatasetFilter(options) datafilter.filterDataDisease()
#Email: [email protected] #Date: Min 13 Des 2020 02:50:08 WIB from model.nn import NNModel from cf.DiCE import DiCE from sklearn.preprocessing import MinMaxScaler, StandardScaler from utils.Dataset import Dataset from utils.adult_dataset import load_adult_income if __name__ == "__main__": income_df = load_adult_income("data/adult/adult.csv") d = Dataset(dataframe=income_df, continuous_features=[ 'age', 'education', 'educational-num', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country' ], outcome_name='income', scaler=MinMaxScaler()) clf = NNModel(model_path='weights/adult.pth') cf = DiCE(d, clf) test_instance = { 'age': 57, 'workclass': 'Self-Employed', 'education': 2, 'educational-num': 10, 'marital-status': 'Married', 'occupation': 'Service', 'relationship': 'Husband', 'race': 'White', 'gender': 'Male',
def test_item_file(end_test_file, embedding_file_path, vocab_file_path, use_gpu): embed = torch.Tensor(np.load(embedding_file_path)['arr_0']) with open(vocab_file_path) as f: word2id = json.load(f) vocab = Vocab(embed, word2id) #with open(end_test_file) as f: # examples = [json.loads(line) for line in f] with open(end_test_file) as f: examples = list() for line in f: if line and not line.isspace(): examples.append(json.loads(line)) #print(examples[0]) test_dataset = Dataset(examples) test_iter = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=False) load_dir = os.path.join(args.input, 'model_files', 'CNN_RNN.pt') if use_gpu: checkpoint = torch.load(load_dir) else: checkpoint = torch.load(load_dir, map_location=lambda storage, loc: storage) if not use_gpu: checkpoint['args'].device = None net = getattr(models, checkpoint['args'].model)(checkpoint['args']) net.load_state_dict(checkpoint['model']) if use_gpu: net.cuda() net.eval() doc_num = len(test_dataset) all_targets = [] all_results = [] all_probs = [] all_acc = [] all_p = [] all_r = [] all_f1 = [] all_sum = [] for batch in tqdm(test_iter): features, targets, summaries, doc_lens = vocab.make_features(batch) if use_gpu: probs = net(Variable(features).cuda(), doc_lens) else: probs = net(Variable(features), doc_lens) start = 0 for doc_id, doc_len in enumerate(doc_lens): doc = batch['doc'][doc_id].split('\n')[:doc_len] stop = start + doc_len prob = probs[start:stop] hyp = [] for _p, _d in zip(prob, doc): print(_p) print(_d) if _p > 0.5: hyp.append(_d) if len(hyp) > 0: print(hyp) all_sum.append("###".join(hyp)) else: all_sum.append('') all_targets.append(targets[start:stop]) all_probs.append(prob) start = stop file_path_elems = end_test_file.split('/') file_name = 'TR-' + file_path_elems[len(file_path_elems) - 1] with open(os.path.join(args.output, file_name), mode='w', encoding='utf-8') as f: for text in all_sum: f.write(text.strip() + '\n') for item in all_probs: all_results.append([1 if tmp > 0.5 else 0 for tmp in item.tolist()]) print(len(all_results)) print(len(all_targets)) print(len(all_probs)) for _1, _2, _3 in zip(all_results, all_targets, all_probs): _2 = _2.tolist() _3 = _3.tolist() print("*" * 3) print('probs : ', _3) print('results : ', _1) print('targets : ', _2) tmp_acc = accuracy_score(_1, _2) tmp_p = precision_score(_1, _2) tmp_r = recall_score(_1, _2) tmp_f1 = f1_score(_1, _2) print('acc : ', tmp_acc) print('p : ', tmp_p) print('r : ', tmp_r) print('f1 : ', tmp_f1) all_acc.append(tmp_acc) all_p.append(tmp_p) all_r.append(tmp_r) all_f1.append(tmp_f1) print('all dataset acc : ', np.mean(all_acc)) print('all dataset p : ', np.mean(all_p)) print('all dataset r : ', np.mean(all_r)) print('all dataset f1 : ', np.mean(all_f1)) print('all results length : ', len(all_results))
class DataAnalyzer: def __init__(self, dataset_options, dir_plots): self.dataset_options = dataset_options self.dataset = Dataset(dataset_options=dataset_options) self.dir_plots = dir_plots return def _printValues(self, category_names, occ_wiederkehrer, occ_normal): for k, name in enumerate(category_names): print(name + ': ' + str(occ_wiederkehrer[k]) + ' <-> ' + str(occ_normal[k])) def _getFeatureValues(self, df, name_feature): column_names = self.dataset.getColumnsDf() feature_columns = [] for col in column_names: if col.startswith(name_feature): feature_columns.append(col) df_feature = df[feature_columns] df_feature_wiederkehrer = df_feature.loc[df['Wiederkehrer'] == 1] df_feature_normal = df_feature.loc[df['Wiederkehrer'] == 0] return [df_feature_normal, df_feature_wiederkehrer] def _filterDFdisease(self, feature_name, feature_categories, df_feature_normal, df_feature_wiederkehrer): print(df_feature_wiederkehrer.shape) print(df_feature_normal.shape) series_normal = [] series_wiederkehrer = [] for cat in feature_categories: series_normal.append(df_feature_normal[feature_name + '_' + cat]) series_wiederkehrer.append(df_feature_wiederkehrer[feature_name + '_' + cat]) df_feature_normal_filtered = pd.concat(series_normal, axis=1) df_feature_wiederkehrer_filtered = pd.concat(series_wiederkehrer, axis=1) return [df_feature_normal_filtered, df_feature_wiederkehrer_filtered] # for categorical features def _doComparisonBar(self, df, name_feature): filename_plot = self.dir_plots + 'featurecomparison_' + name_feature + '.png' print(name_feature) categories_feature = self.dataset_options.getFeatureCategories( name_feature) if name_feature == self.dataset_options.getNameMainDiag(): if self.dataset_options.getOptionsFiltering( ) in self.dataset_options.getDiseaseNames(): categories_feature = self.dataset_options.getDiseaseICDkeys() print(categories_feature) values_to_count = range(0, len(categories_feature)) [df_feature_normal, df_feature_wiederkehrer] = self._getFeatureValues(df, name_feature) if df_feature_wiederkehrer.shape[1] > 0 and df_feature_normal.shape[ 1] > 0: if name_feature == self.dataset_options.getNameMainDiag(): if self.dataset_options.getOptionsFiltering( ) in self.dataset_options.getDiseaseNames(): [df_feature_normal, df_feature_wiederkehrer ] = self._filterDFdisease(name_feature, categories_feature, df_feature_normal, df_feature_wiederkehrer) num_feature_normal = df_feature_normal.shape[0] num_feature_wiederkehrer = df_feature_wiederkehrer.shape[0] occ_feature_wiederkehrer = df_feature_wiederkehrer.sum(axis=0) occ_feature_normal = df_feature_normal.sum(axis=0) self._printValues(categories_feature, occ_feature_wiederkehrer, occ_feature_normal) occ_wiederkehrer = occ_feature_wiederkehrer.values occ_normal = occ_feature_normal.values density_normal = occ_normal / float(num_feature_normal) density_wiederkehrer = occ_wiederkehrer / float( num_feature_wiederkehrer) print(len(values_to_count)) print(density_wiederkehrer.shape) fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 10)) plt.bar(values_to_count, height=density_wiederkehrer.flatten(), width=1.0, align='center', color='b', alpha=0.5) plt.bar(values_to_count, height=density_normal.flatten(), width=1.0, align='center', color='m', alpha=0.5) plt.xlim([-1, len(categories_feature) + 1]) plt.xticks(range(0, len(values_to_count)), categories_feature) plt.legend(['Wiederkehrer', 'normal']) plt.title(name_feature) plt.draw() plt.savefig(filename_plot, format='png') plt.close() # for numerical features def _doComparisonHist(self, df, 
name_feature): filename_plot = self.dir_plots + 'featurecomparison_' + name_feature + '.png' print(name_feature) [df_feature_normal, df_feature_wiederkehrer] = self._getFeatureValues(df, name_feature) if df_feature_wiederkehrer.shape[1] > 0 and df_feature_normal.shape[ 1] > 0: num_values_normal = df_feature_normal.shape[0] num_values_wiederkehrer = df_feature_wiederkehrer.shape[0] values_wiederkehrer = df_feature_wiederkehrer.values values_normal = df_feature_normal.values print('normal: ' + str(df_feature_normal.shape)) print('normal: ' + str(df_feature_wiederkehrer.shape)) if num_values_normal > 0 and num_values_wiederkehrer > 0: min_value = float( min(min(values_normal), min(values_wiederkehrer))) max_value = float( max(max(values_normal), max(values_wiederkehrer))) elif num_values_wiederkehrer > 0: min_value = float(min(values_wiederkehrer)) max_value = float(max(values_wiederkehrer)) elif num_values_normal > 0: min_value = float(min(values_normal)) max_value = float(max(values_normal)) else: pass num_different_values = np.unique( np.vstack([values_wiederkehrer, values_normal])).shape[0] if num_different_values > 100: num_bins_hist = 100 else: num_bins_hist = num_different_values print('min value: ' + str(min_value)) print('max value: ' + str(max_value)) range_hist = [min_value, max_value] # print(bins_hist) hist_feature_wiederkehrer, bins_wiederkehrer = np.histogram( values_wiederkehrer, range=range_hist, bins=num_bins_hist, density=True) hist_feature_normal, bins_normal = np.histogram(values_normal, range=range_hist, bins=num_bins_hist, density=True) hist_feature_wiederkehrer = hist_feature_wiederkehrer / hist_feature_wiederkehrer.sum( ) hist_feature_normal = hist_feature_normal / hist_feature_normal.sum( ) bar_width_wiederkehrer = bins_wiederkehrer[ 1:] - bins_wiederkehrer[:-1] bar_width_normal = bins_normal[1:] - bins_normal[:-1] fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 10)) plt.bar(bins_wiederkehrer[:-1], height=hist_feature_wiederkehrer, width=bar_width_wiederkehrer, align='edge', color='b', alpha=0.5) plt.bar(bins_normal[:-1], height=hist_feature_normal, width=bar_width_normal, align='edge', color='m', alpha=0.5) plt.legend(['Wiederkehrer', 'normal']) plt.title(name_feature) plt.draw() plt.savefig(filename_plot, format='png') plt.close() # ideal would be to automatically select the comparison type from the feature name # would need to give a flag with the feature name # i dont know if that would be practical in the long run # but like this it is not ideal either def doFeatureComparison(self): df = self.dataset.getDf() df_wiederkehrer = df['Wiederkehrer'] print('num_wiederkehrer: ' + str(df_wiederkehrer.sum(axis=0))) self._doComparisonHist(df, 'ratio_los_age') self._doComparisonHist(df, 'ratio_numDK_age') self._doComparisonHist(df, 'ratio_numOE_age') self._doComparisonHist(df, 'ratio_los_numDK') self._doComparisonHist(df, 'ratio_los_numOE') self._doComparisonHist(df, 'mult_los_numCHOP') self._doComparisonHist(df, 'ratio_numCHOP_age') self._doComparisonHist(df, 'Eintrittsalter') self._doComparisonHist(df, 'Verweildauer') self._doComparisonHist(df, 'numDK') self._doComparisonHist(df, 'numOE') self._doComparisonHist(df, 'numCHOP') self._doComparisonHist(df, 'Langlieger') self._doComparisonHist(df, 'equalOE') self._doComparisonHist(df, 'previous_visits') self._doComparisonHist(df, 'diff_drg_alos') self._doComparisonHist(df, 'diff_drg_lowerbound') self._doComparisonHist(df, 'diff_drg_upperbound') self._doComparisonHist(df, 'rel_diff_drg_alos') self._doComparisonHist(df, 
'rel_diff_drg_lowerbound') self._doComparisonHist(df, 'rel_diff_drg_upperbound') self._doComparisonHist(df, 'alos') self._doComparisonHist(df, 'ratio_drg_los_alos') self._doComparisonBar(df, 'EntlassBereich') self._doComparisonBar(df, 'Versicherungsklasse') self._doComparisonBar(df, 'Geschlecht') self._doComparisonBar(df, 'Forschungskonsent') self._doComparisonBar(df, 'Entlassjahr') self._doComparisonBar(df, 'Entlassmonat') self._doComparisonBar(df, 'Entlasstag') self._doComparisonBar(df, 'Aufnahmeart') self._doComparisonBar(df, 'Entlassart') self._doComparisonBar(df, 'Eintrittsart') self._doComparisonBar(df, 'Liegestatus') self._doComparisonBar(df, 'Hauptdiagnose') # self._doComparisonBar(df, 'CHOP'); def _getRatioWiederkehrerFlag(self): early_readmission_flag = self.dataset_options.getEarlyReadmissionFlagname( ) df = self.dataset.getDf() df_wiederkehrer = df[early_readmission_flag] num_wiederkehrer = int(df_wiederkehrer.sum(axis=0)) num_all = int(df.shape[0]) print('num all: ' + str(num_all)) print('num_wiederkehrer: ' + str(df_wiederkehrer.sum(axis=0))) print('ratio wiederkehrer: ' + str(float(num_wiederkehrer) / float(num_all))) def _getRatio18DaysReturn(self): df = self.dataset.getDf() df = df.sort_values(by=['Patient', 'Aufnahmedatum']) patient_ids_wiederkehrer = df['Patient'].unique() single_visiting_patients = 0 for k in range(0, len(patient_ids_wiederkehrer)): p_id = patient_ids_wiederkehrer[k] cases_df = df.loc[df['Patient'] == p_id] new_patient = True if cases_df.shape[0] == 1: single_visiting_patients += 1 for index, row in cases_df.iterrows(): if not new_patient: timestamp_enter = row['Aufnahmedatum'] diff = (datetime.fromtimestamp(timestamp_enter) - datetime.fromtimestamp(timestamp_previous_exit)) days = diff.days if int(days) <= 18: # print(str(datetime.fromtimestamp(timestamp_enter).strftime("%y,%m,%d")) + ' vs. 
' + str(datetime.fromtimestamp(timestamp_previous_exit).strftime("%y,%m,%d"))) # print(str(int(row['Patient'])) + ': ' + ' --> ' + str(days) + ' --> ' + str(row['Wiederkehrer'])) df.at[index_previous, 'Wiederkehrer'] = 1 else: new_patient = False timestamp_previous_exit = row['Entlassdatum'] index_previous = index num_wiederkehrer_all = int(df['Wiederkehrer'].sum(axis=0)) num_all = int(df.shape[0]) print('patients with only a single visit: ' + str(single_visiting_patients)) print('num all: ' + str(num_all)) print('num wiedekehrer all: ' + str(num_wiederkehrer_all)) print('ratio wiederkehrer all: ' + str(float(num_wiederkehrer_all) / float(num_all))) def checkWiederkehrer(self): self._getRatioWiederkehrerFlag() if self.dataset_options.getDataPrefix() == 'patrec': self._getRatio18DaysReturn() def _getNumberColumnsSubgroupPatrec(self, subgroup): dir_data = self.dataset_options.getDirData() dataset = self.dataset_options.getDatasetName() chunksize = self.dataset_options.getChunkSize() filename_data_subgroup = dir_data + 'data_patrec_' + dataset + '_' + subgroup + '_clean.csv' subgroup_data_reader = pd.read_csv(filename_data_subgroup, chunksize=chunksize) for k, chunk in enumerate(subgroup_data_reader): chunk = chunk.drop(self.dataset_options.getEventColumnName(), axis=1) columns = list(chunk.columns) sum_chunk = chunk.sum(axis=0) if k == 0: sum_subgroup = pd.DataFrame(data=np.zeros((1, len(columns))), columns=columns) sum_subgroup = sum_subgroup.add(sum_chunk) num_columns = int(sum_subgroup.astype(bool).sum(axis=1).values) print(subgroup + ' --> number of columns: ' + str(len(columns))) print(subgroup + ' --> number of non-zero columns: ' + str(num_columns)) def _getAvgNumSubgroupPatrec(self, subgroup): dir_data = self.dataset_options.getDirData() dataset = self.dataset_options.getDatasetName() name_demographic_features = self.dataset_options.getFilenameOptionDemographicFeatures( ) encoding = self.dataset_options.getEncodingScheme() feature_set_str = self.dataset_options.getFeatureSetStr() filename_data_subgroup = dir_data + 'data_patrec_' + dataset + '_' + name_demographic_features + '_' + feature_set_str + '_' + encoding + '.csv' df = pd.read_csv(filename_data_subgroup) df_num_subgroup = df['num' + subgroup] avg_num = np.mean(df_num_subgroup.values) return avg_num def _getAvgNumSubgroupNZ(self): dir_data = self.dataset_options.getDirData() dataset = self.dataset_options.getDatasetName() name_demographic_features = self.dataset_options.getFilenameOptionDemographicFeatures( ) grouping = self.dataset_options.getGroupingName() encoding = self.dataset_options.getEncodingScheme() feature_set_str = self.dataset_options.getFeatureSetStr() filename_data_subgroup = dir_data + 'data_nz_' + dataset + '_' + feature_set_str + '_' + encoding + '_' + grouping + '.csv' df = pd.read_csv(filename_data_subgroup) df_num_subgroup = df['diag_DIAG_COUNT'] avg_num = np.mean(df_num_subgroup.values) return avg_num def _getNumberColumnsSubgroupNZ(self, subgroup): dir_data = self.dataset_options.getDirData() dataset = self.dataset_options.getDatasetName() chunksize = self.dataset_options.getChunkSize() filename_data_subgroup = dir_data + 'data_nz_' + dataset + '_' + subgroup + '_clean.csv' def _getNumberHauptdiagnosePatrec(self): dir_data = self.dataset_options.getDirData() dataset = self.dataset_options.getDatasetName() filename_data = dir_data + 'data_patrec_' + dataset + '_REST_clean.csv' df = pd.read_csv(filename_data) diff_values_hauptdiagnose = list(set(df['Hauptdiagnose'].values)) print('Hauptdiagnose --> number 
of values: ' + str(len(diff_values_hauptdiagnose))) def _getNumberHauptdiagnoseNZ(self): dir_data = self.dataset_options.getDirData() dataset = self.dataset_options.getDatasetName() filename_data = dir_data + 'data_nz_' + dataset + '_discharge.csv' df = pd.read_csv(filename_data) diff_values_hauptdiagnose = list(set(df['main_diag'].values)) print('Hauptdiagnose --> number of values: ' + str(len(diff_values_hauptdiagnose))) def getNumberColumnsSubgroup(self, subgroup): data_prefix = self.dataset_options.getDataPrefix() if data_prefix == 'patrec': self._getNumberColumnsSubgroupPatrec(subgroup) elif data_prefix == 'nz': pass else: print('data prefix is unknown...exit') sys.exit() def getNumberHauptdiagnose(self): data_prefix = self.dataset_options.getDataPrefix() if data_prefix == 'patrec': self._getNumberHauptdiagnosePatrec() elif data_prefix == 'nz': self._getNumberHauptdiagnoseNZ() else: print('data prefix is unknown...exit') sys.exit() def getAvgNumberSubgroup(self, subgroup): data_prefix = self.dataset_options.getDataPrefix() if data_prefix == 'patrec': avg_num = self._getAvgNumSubgroupPatrec(subgroup) return avg_num elif data_prefix == 'nz': if not subgroup == 'DK': print('only implemented for diagnoses...exit') sys.exit() avg_num = self._getAvgNumSubgroupNZ() return avg_num else: print('unknown data prefix..exit') sys.exit()
parser.add_argument('--data_dir', type=str, default='./data') parser.add_argument('--save_dir', type=str, default='./saves') parser.add_argument('--conf_dir', type=str, default='./conf') parser.add_argument('--seed', type=int, default=225) conf = parser.parse_args() model_conf = Params(os.path.join(conf.conf_dir, conf.model.lower() + '.json')) np.random.seed(conf.seed) torch.random.manual_seed(conf.seed) device = torch.device('cuda') if torch.cuda.is_available() else torch.device( 'cpu') dataset = Dataset(data_dir=conf.data_dir, data_name=model_conf.data_name, train_ratio=model_conf.train_ratio, device=device) log_dir = os.path.join('saves', conf.model) logger = Logger(log_dir) model_conf.save(os.path.join(logger.log_dir, 'config.json')) eval_pos, eval_target = dataset.eval_data() item_popularity = dataset.item_popularity evaluator = Evaluator(eval_pos, eval_target, item_popularity, model_conf.top_k) model_base = getattr(models, conf.model) model = model_base(model_conf, dataset.num_users, dataset.num_items, device) logger.info(model_conf) logger.info(dataset)
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate (lr_xx), activation_function (act_xx), nodes_of_layers (dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    num = 10
    data = Dataset('COIL20_3views')
    X, gt = data.load_data()
    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)
    para_lambda = 1
    batch_size = X['0'].shape[0]
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-2
    epochs_pre = 300
    epochs_total = 100
import tensorflow as tf import numpy as np import scipy.io as scio from utils.Net_ae import Net_ae from utils.Net_dg import Net_dg from utils.next_batch import next_batch import math from sklearn.utils import shuffle import timeit from keras.layers import * from utils.print_result import print_result from keras.models import Model from utils.Dataset import Dataset data = Dataset('handwritten_2views') x1, x2, gt = data.load_data() x1 = data.normalize(x1, 0) x2 = data.normalize(x2, 0) n_clusters = len(set(gt)) def xavier_init(fan_in, fan_out, constant=1): low = -constant * np.sqrt(6.0 / (fan_in + fan_out)) high = constant * np.sqrt(6.0 / (fan_in + fan_out)) return tf.random_uniform((fan_in, fan_out), minval=low, maxval=high, dtype=tf.float32) class dualModel: def __init__(self,epochs): self.epochs=epochs def train_model(self,X1, X2, gt, para_lambda, dims, act, lr, epochs, batch_size): err_total = list() start = timeit.default_timer()
for year in years: print('year: ' + str(year)) dict_options_dataset = { 'dir_data': dirData, 'data_prefix': 'nz', 'dataset': str(year), 'encoding': 'embedding', 'grouping': 'verylightgrouping', 'newfeatures': None, 'featurereduction': { 'method': 'FUSION' } } options_dataset_year = DatasetOptions(dict_options_dataset) dataset_year = Dataset(options_dataset_year) if balanced: df_year = dataset_year.getBalancedSubSet() else: df_year = dataset_year.getDf() #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd) print(df_year.shape) df_all_years = df_all_years.append(df_year) print('df balanced all years: ' + str(df_all_years.shape)) encoding = options_dataset_year.getEncodingScheme() grouping = options_dataset_year.getGroupingName() featureset = options_dataset_year.getFeatureSetStr() filename_data_years = dirData + 'data_nz_' + str(min(years)) + str(