def __init__(self, dir_model, dataset_options, feature_columns, mode, balanced_datasets=True, resample_datasets=False):
     self.dir_model = dir_model
     self.dataset_options = dataset_options
     self.dataset = Dataset(self.dataset_options)
     self.feature_columns = feature_columns
     self.mode = mode
     self.balanced_datasets = balanced_datasets
     self.resample_datasets = resample_datasets
     return
Example #2
 def __init__(self,
              mode,
              dir_model,
              dataset_options,
              balanced_datasets=True):
     self.dir_model = dir_model
     self.mode = mode
     self.dataset_options = dataset_options
     self.dataset = Dataset(self.dataset_options)
     self.balanced_datasets = balanced_datasets
     return
Example #3
    def buildData(self, srcBatch, goldBatch, svo_batch):
        srcData = []
        tgtData = [] if goldBatch else None
        svoData = []
        tgt_extend_vocab = [] if goldBatch else None
        src_extend_vocab = []
        src_oovs_list = []
        for i, (srcWords, svo_list) in enumerate(zip(srcBatch, svo_batch)):
            srcData += [
                self.src_dict.convertToIdx(srcWords, Constants.UNK_WORD)
            ]
            svoData += [[
                self.src_dict.convertToIdx(one_svo, Constants.UNK_WORD)
                for one_svo in svo_list
            ]]

            if goldBatch:
                tgtData += [
                    self.tgt_dict.convertToIdx(goldBatch[i],
                                               Constants.UNK_WORD,
                                               Constants.BOS_WORD,
                                               Constants.EOS_WORD)
                ]

            if self.opt.pointer_gen:
                # store the temporary OOV vocabulary
                enc_input_extend_vocab, article_oovs = self.article2ids(
                    srcWords, self.src_dict)
                src_extend_vocab += [enc_input_extend_vocab]
                src_oovs_list += [article_oovs]
                if goldBatch:
                    abs_ids_extend_vocab = self.abstract2ids(
                        goldBatch[i], self.tgt_dict, article_oovs)
                    # overwrite the target ids so the temporary vocabulary is used
                    vec = []
                    vec += [self.src_dict.lookup(Constants.BOS_WORD)]
                    vec += abs_ids_extend_vocab
                    vec += [self.src_dict.lookup(Constants.EOS_WORD)]
                    tgt_extend_vocab.append(torch.LongTensor(vec))

        if goldBatch:
            train = {
                'src': (srcData, svoData),
                'tgt': tgtData,
                'src_extend_vocab': src_extend_vocab,
                'tgt_extend_vocab': tgt_extend_vocab,
                'src_oovs_list': src_oovs_list,
            }
        else:
            train = {
                'src': (srcData, svoData),
                'src_extend_vocab': src_extend_vocab,
                'src_oovs_list': src_oovs_list,
            }
        return Dataset(train,
                       self.opt.batch_size,
                       self.opt.cuda,
                       volatile=True,
                       pointer_gen=self.opt.pointer_gen,
                       is_coverage=self.opt.is_coverage)
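
The `article2ids` and `abstract2ids` helpers called above are not shown in this example; the following is a minimal sketch of the temporary-OOV mapping they are assumed to perform in a pointer-generator setup (the `lookup`/`size` calls on the dictionaries and the fallback-to-UNK behaviour are assumptions, not taken from the example):

    def article2ids(self, article_words, src_dict):
        # Assumed sketch: in-vocabulary words keep their regular ids; OOV words are
        # collected into article_oovs and mapped to temporary ids vocab_size + oov_index.
        ids, oovs = [], []
        unk_id = src_dict.lookup(Constants.UNK_WORD)
        for w in article_words:
            idx = src_dict.lookup(w)
            if idx is None or idx == unk_id:
                if w not in oovs:
                    oovs.append(w)
                ids.append(src_dict.size() + oovs.index(w))
            else:
                ids.append(idx)
        return ids, oovs

    def abstract2ids(self, abstract_words, tgt_dict, article_oovs):
        # Assumed sketch: target-side OOVs that also appear in the source reuse the
        # temporary ids assigned by article2ids; all other OOVs stay UNK.
        ids = []
        unk_id = tgt_dict.lookup(Constants.UNK_WORD)
        for w in abstract_words:
            idx = tgt_dict.lookup(w)
            if idx is None or idx == unk_id:
                if w in article_oovs:
                    ids.append(tgt_dict.size() + article_oovs.index(w))
                else:
                    ids.append(unk_id)
            else:
                ids.append(idx)
        return ids
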
Example #4
def load_train_data():
    onlinePreprocess.seq_length = opt.max_sent_length_source  # truncation length for training
    onlinePreprocess.shuffle = 1 if opt.process_shuffle else 0
    train_data, vocab_dicts = prepare_data_online(opt)
    trainData = Dataset(train_data, opt.batch_size, opt.gpus, pointer_gen=opt.pointer_gen, is_coverage=opt.is_coverage)
    logger.info(' * vocabulary size. source = %d; target = %d' %
                (vocab_dicts['src'].size(), vocab_dicts['tgt'].size()))
    logger.info(' * number of training sentences. %d' %
                len(train_data['src']))
    return trainData, vocab_dicts
Example #5
class NeuralNetDatasetMaker:
    def __init__(self,
                 mode,
                 dir_model,
                 dataset_options,
                 balanced_datasets=True):
        self.dir_model = dir_model
        self.mode = mode
        self.dataset_options = dataset_options
        self.dataset = Dataset(self.dataset_options)
        self.balanced_datasets = balanced_datasets
        return

    def createDatasets(self):
        print('_getFilenameDatasetBalanced: ' + str(self.mode))
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(
            os.sep)[-1][:-4]
        if self.mode == 'traineval':
            if self.balanced_datasets:
                [df_training, df_testing
                 ] = self.dataset.getBalancedSubsetTrainingAndTesting()
                self.num_samples_train = df_training.shape[0]
                self.num_samples_validation = df_testing.shape[0]
                filename_train = filename_prefix + '_balanced_train.csv'
                filename_eval = filename_prefix + '_balanced_eval.csv'
                df_training.to_csv(filename_train,
                                   line_terminator='\n',
                                   index=False)
                df_testing.to_csv(filename_eval,
                                  line_terminator='\n',
                                  index=False)
                print(filename_train)
                print(filename_eval)
            else:
                [training, testing] = self.dataset.getTrainingAndTestingSet()
                df_training_pos = training[0]
                df_training_neg = training[1]
                df_eval_pos = testing[0]
                df_eval_neg = testing[1]
                self.num_samples_train = 2 * int(df_training_neg.shape[0])
                self.num_samples_validation = 2 * int(df_eval_neg.shape[0])
                filename_train_pos = filename_prefix + '_train_pos.csv'
                filename_train_neg = filename_prefix + '_train_neg.csv'
                filename_eval_pos = filename_prefix + '_eval_pos.csv'
                filename_eval_neg = filename_prefix + '_eval_neg.csv'
                df_training_pos.to_csv(filename_train_pos,
                                       line_terminator='\n',
                                       index=False)
                df_training_neg.to_csv(filename_train_neg,
                                       line_terminator='\n',
                                       index=False)
                df_eval_pos.to_csv(filename_eval_pos,
                                   line_terminator='\n',
                                   index=False)
                df_eval_neg.to_csv(filename_eval_neg,
                                   line_terminator='\n',
                                   index=False)
        else:
            if self.balanced_datasets:
                df_balanced = self.dataset.getBalancedSubSet()
                filename_dataset = filename_prefix + '_balanced_' + self.mode + '.csv'
                df_balanced.to_csv(filename_dataset,
                                   line_terminator='\n',
                                   index=False)
                print(filename_dataset)
            else:
                print('no valid configuration of datasets and mode..exit')
                sys.exit()

    def removeDatasets(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(
            os.sep)[-1][:-4]
        if self.balanced_datasets:
            filename_dataset = filename_prefix + '_balanced_' + self.mode + '.csv'
            print('remove: ' + str(filename_dataset))
            os.remove(filename_dataset)
        else:
            print('no valid configuration of datasets and mode..exit')
            sys.exit()

    def _dfToFile(self, df, filename):
        list_df = [df[i:i + 10000] for i in range(0, df.shape[0], 10000)]
        list_df[0].to_csv(filename, index=False, line_terminator='\n')
        for l in list_df[1:]:
            l.to_csv(filename,
                     index=False,
                     line_terminator='\n',
                     header=False,
                     mode='a')

    def createDatasetsAutoEncoder(self):
        print('_getFilenameDatasetBalanced: ' + str(self.mode))
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(
            os.sep)[-1][:-4]
        if self.mode == 'traineval':
            df = self.dataset.getData()
            df = df.sample(frac=1)
            print('num samples: ' + str(df.shape[0]))
            print('df.shape: ' + str(df.shape))
            num_samples = df.shape[0]
            ratio_train_test = self.dataset_options.getRatioTrainingSamples()
            df_train = df[:int(round(ratio_train_test * num_samples))]
            df_eval = df[int(round(ratio_train_test * num_samples)):]
            filename_train = filename_prefix + '_balanced_train.csv'
            filename_eval = filename_prefix + '_balanced_eval.csv'
            self._dfToFile(df_train, filename_train)
            self._dfToFile(df_eval, filename_eval)
        else:
            filename_test = filename_prefix + '_test.csv'
            df = self.dataset.getData()
            df = df.sample(frac=1)
            self._dfToFile(df, filename_test)
Example #6
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate(lr_xx), activation_function(act_xx), nodes_of_layers(dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':

    num = 30
    data = Dataset('coil_2views')
    x1, x2, gt = data.load_data()
    X = dict()
    X[str(0)], X[str(1)] = x1, x2
    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)

    para_lambda = 1
    batch_size = X['0'].shape[0]
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-2
    epochs_pre = 300
device = torch.device("cuda")

G = AEI_Net(512).to(device)
D = MultiscaleDiscriminator(input_nc=3,
                            ndf=64,
                            n_layers=6,
                            norm_layer=torch.nn.InstanceNorm2d).to(device)
G.train()
D.train()

arcface = Backbone(50, 0.6, 'ir_se').to(device)
arcface.eval()
arcface.load_state_dict(torch.load("./model_weights/model_ir_se50.pth"))

dataset = Dataset("./inputs/processed")

dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)

MSE = torch.nn.MSELoss()
L1 = torch.nn.L1Loss()


def hinge_loss(X, positive=True):
    if positive:
        return torch.relu(1 - X).mean()
    return torch.relu(X).mean()


def get_grid_image(X):
    X = X[:8]
class NeuralNetDatasetHandler:

    def __init__(self, dir_model, dataset_options, feature_columns, mode, balanced_datasets=True, resample_datasets=False):
        self.dir_model = dir_model
        self.dataset_options = dataset_options
        self.dataset = Dataset(self.dataset_options)
        self.feature_columns = feature_columns
        self.mode = mode
        self.balanced_datasets = balanced_datasets
        self.resample_datasets = resample_datasets
        return


    def _parse_csv(self, value):
        # print('Parsing', data_file)
        column_names = self.dataset.getColumnsData()
        default_values = self.feature_columns.getDefaultValues(column_names)
        columns = tf.decode_csv(value, record_defaults=default_values)
        features = dict(zip(column_names, columns))
        early_readmission_flagname = self.dataset_options.getEarlyReadmissionFlagname()
        labels = features.pop(early_readmission_flagname)
        return features, tf.equal(labels, 1)


    def _parse_csv_autoencoder(self, value):
        # print('Parsing', data_file)
        column_names = self.dataset.getColumnsData()
        default_values = self.feature_columns.getDefaultValues(column_names)
        columns = tf.decode_csv(value, record_defaults=default_values)
        features = dict(zip(column_names, columns))
        numeric_id_labels = features.pop('main_diag_ind')
        return features, tf.convert_to_tensor(numeric_id_labels)


    def _parse_csv_encode_maindiag(self, value):
        # print('Parsing', data_file)
        column_names = self.dataset.getColumnsData()
        default_values = self.feature_columns.getDefaultValues(column_names)
        columns = tf.decode_csv(value, record_defaults=default_values)
        features = dict(zip(column_names, columns))
        numeric_id_labels = features.pop('main_diag_ind')
        features = {'diag': features.pop('main_diag')}
        return features, tf.convert_to_tensor(numeric_id_labels)


    def _getFilenameDatasetBalanced(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'train':
            filename = filename_prefix + '_balanced_train.csv'
        elif self.mode == 'eval':
            filename = filename_prefix + '_balanced_eval.csv'
        elif self.mode == 'test':
            filename = filename_prefix + '_balanced_test.csv'
        else:
            print('unknown mode...exit')
            sys.exit()
        return filename


    def _getFilenamesDatasetAll(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'train':
            filenames = [filename_prefix + '_train_pos.csv', filename_prefix + '_train_neg.csv']
        elif self.mode == 'eval':
            filenames = [filename_prefix + '_eval_pos.csv', filename_prefix + '_eval_neg.csv']
        elif self.mode == 'test':
            filenames = [filename_prefix + '_test_pos.csv', filename_prefix + '_test_neg.csv']
        else:
            print('unknown mode...exit')
            sys.exit()
        return filenames


    def _getFilenameDatasetAutoEncoder(self):
        filename_dataset_base = self.dataset_options.getFilename()
        filename_prefix = self.dir_model + os.sep + filename_dataset_base.split(os.sep)[-1][:-4]
        if self.mode == 'train':
            filename = filename_prefix + '_balanced_train.csv'
        elif self.mode == 'eval':
            filename = filename_prefix + '_balanced_eval.csv'
        elif self.mode == 'test':
            filename = filename_prefix + '_test.csv'
        else:
            print('unknown mode...exit')
            sys.exit()
        return filename


    def _dataset_reader(self):
        if self.balanced_datasets:
            filename_dataset = self._getFilenameDatasetBalanced()
            # shuffle is only performed for training; not optimal --> maybe give another flag to specify training/eval
            print('read: ' + str(filename_dataset))
            dataset = tf.data.TextLineDataset(filename_dataset)
            dataset = dataset.skip(1)
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamplesBalancedSubset())
            dataset = dataset.map(self._parse_csv, num_parallel_calls=5)
            return dataset
        else:
            filenames_dataset = self._getFilenamesDatasetAll()
            data_file_pos = filenames_dataset[0]
            data_file_neg = filenames_dataset[1]

            # Extract lines from input files using the Dataset API.
            ds_pos = tf.data.TextLineDataset(data_file_pos)
            ds_neg = tf.data.TextLineDataset(data_file_neg)

            ds_pos = ds_pos.skip(1)
            ds_neg = ds_neg.skip(1)
            ds_neg = ds_neg.map(self._parse_csv, num_parallel_calls=5)
            ds_pos = ds_pos.map(self._parse_csv, num_parallel_calls=5)

            dataset = tf.data.Dataset.zip((ds_pos, ds_neg))

            # Each input element will be converted into a two-element `Dataset` using
            # `Dataset.from_tensors()` and `Dataset.concatenate()`, then `Dataset.flat_map()`
            # will flatten the resulting `Dataset`s into a single `Dataset`.
            dataset = dataset.flat_map(
                lambda ex_pos, ex_neg: tf.data.Dataset.from_tensors(ex_pos).concatenate(
                    tf.data.Dataset.from_tensors(ex_neg)))
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamplesBalancedSubset())
            return dataset


    def _dataset_reader_autoencoder(self):
        if self.balanced_datasets:
            filename_dataset = self._getFilenameDatasetAutoEncoder()
            print(filename_dataset)
            # shuffle is only performed for training; not optimal --> maybe give another flag to specify training/eval
            dataset = tf.data.TextLineDataset(filename_dataset)
            dataset = dataset.skip(1)
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamples())
            dataset = dataset.map(self._parse_csv_autoencoder, num_parallel_calls=5)
            return dataset
        else:
            # assumed: the positive/negative file pair comes from _getFilenamesDatasetAll(),
            # which returns the [pos, neg] filenames unpacked below
            filenames_dataset = self._getFilenamesDatasetAll()
            data_file_pos = filenames_dataset[0]
            data_file_neg = filenames_dataset[1]

            # Extract lines from input files using the Dataset API.
            ds_pos = tf.data.TextLineDataset(data_file_pos)
            ds_neg = tf.data.TextLineDataset(data_file_neg)

            ds_pos = ds_pos.skip(1)
            ds_neg = ds_neg.skip(1)
            ds_neg = ds_neg.map(self._parse_csv_autoencoder, num_parallel_calls=5)
            ds_pos = ds_pos.map(self._parse_csv_autoencoder, num_parallel_calls=5)

            dataset = tf.data.Dataset.zip((ds_pos, ds_neg))

            # Each input element will be converted into a two-element `Dataset` using
            # `Dataset.from_tensors()` and `Dataset.concatenate()`, then `Dataset.flat_map()`
            # will flatten the resulting `Dataset`s into a single `Dataset`.
            dataset = dataset.flat_map(
                lambda ex_pos, ex_neg: tf.data.Dataset.from_tensors(ex_pos).concatenate(
                    tf.data.Dataset.from_tensors(ex_neg)))
            if self.mode == 'train':
                dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamples())
            return dataset


    def _dataset_reader_encode_main_diag(self):
        filename_dataset = self._getFilenameDatasetAutoEncoder()
        print(filename_dataset)
        # shuffle is only performed for training; not optimal --> maybe give another flag to specify training/eval
        dataset = tf.data.TextLineDataset(filename_dataset)
        dataset = dataset.skip(1)
        if self.mode == 'train':
            dataset = dataset.shuffle(buffer_size=self.dataset.getNumSamples())
        dataset = dataset.map(self._parse_csv_encode_maindiag, num_parallel_calls=5)
        return dataset


    def update_model_dir(self, dir_model):
        self.dir_model = dir_model


    def readDatasetTF(self):
        return self._dataset_reader()


    def readDatasetAE(self):
        return self._dataset_reader_autoencoder()


    def getDatasetEncodeMainDiag(self):
        return self._dataset_reader_encode_main_diag()
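
For orientation, a minimal sketch of how the reader above could feed a TF 1.x Estimator-style input_fn; only readDatasetTF() comes from the class, while the function name, batch size, and repeat count are assumptions:

def input_fn_from_handler(handler, batch_size=128, num_epochs=1):
    # handler is assumed to be a NeuralNetDatasetHandler; readDatasetTF() yields
    # (features, label) pairs parsed from the balanced CSV files.
    dataset = handler.readDatasetTF()
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    # TF 1.x one-shot iterator; an Estimator consumes the returned tensors directly.
    features, labels = dataset.make_one_shot_iterator().get_next()
    return features, labels
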
def encode(flags_obj):
    """Run Wide-Deep training and eval loop.
    Args:
    flags_obj: An object containing parsed flag values.
    """
    dict_data_training = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '20012016',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_training = DatasetOptions(dict_data_training)

    dict_data_encoding = {
        'dir_data': DIRPROJECT + 'data/',
        'data_prefix': 'nz',
        'dataset': '2017',
        'encoding': 'embedding',
        'newfeatures': None,
        'featurereduction': {
            'method': 'FUSION'
        },
        'grouping': 'verylightgrouping'
    }
    dataset_options_encoding = DatasetOptions(dict_data_encoding)

    feature_columns = FeatureColumnsAutoEncoderNZ(
        dataset_options=dataset_options_encoding)

    dict_dataset_options = {
        'train': dataset_options_training,
        'eval': None,
        'test': dataset_options_encoding
    }

    nn = AutoEncoderModel('test', dict_dataset_options, feature_columns,
                          flags_obj)
    diag_encodings = nn.encode()
    print('diag_encodings --> main diag: ' + str(diag_encodings[0].shape))
    print('diag_encodings --> secondary diags: ' +
          str(diag_encodings[1].shape))

    main_diag_encodings = diag_encodings[0]
    sec_diag_encodings = diag_encodings[1]

    dataset_encoding = Dataset(dataset_options_encoding)
    df_encoding = dataset_encoding.getDf()
    print('df_encoding: ' + str(df_encoding.shape))
    num_encoded_dim = main_diag_encodings.shape[1]

    dir_data = dataset_options_encoding.getDirData()
    dataset = dataset_options_encoding.getDatasetName()
    data_prefix = dataset_options_encoding.getDataPrefix()
    demographic_featurename = dataset_options_encoding.getFilenameOptionDemographicFeatures(
    )
    featureset_str = dataset_options_encoding.getFeatureSetStr()
    encoding = dataset_options_encoding.getEncodingScheme()
    name_event_column = dataset_options_encoding.getEventColumnName()

    name_main_diag = dataset_options_encoding.getNameMainDiag()
    name_sec_diag = dataset_options_encoding.getNameSecDiag()
    df_encoding_sec_diag = df_encoding[name_event_column].to_frame()
    df_encoding_main_diag = df_encoding[name_event_column].to_frame()

    num_encoded_dim = sec_diag_encodings.shape[1]
    for k in range(0, num_encoded_dim):
        new_col_secdiag = name_sec_diag + '_dim_' + str(k)
        df_encoding_sec_diag[new_col_secdiag] = sec_diag_encodings[:, k]

        new_col_maindiag = name_main_diag + '_dim_' + str(k)
        df_encoding_main_diag[new_col_maindiag] = main_diag_encodings[:, k]

    print('df_encoding_main_diag: ' + str(df_encoding_main_diag.shape))
    print('df_encoding_sec_diag: ' + str(df_encoding_sec_diag.shape))

    filename_sec_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_sec_diag + '_' + str(
        num_encoded_dim) + 'dim.csv'
    filename_main_diag_encoding = dir_data + 'data_' + data_prefix + '_' + dataset + '_' + name_main_diag + '_' + str(
        num_encoded_dim) + 'dim.csv'

    list_df = [
        df_encoding_sec_diag[i:i + 10000]
        for i in range(0, df_encoding_sec_diag.shape[0], 10000)
    ]
    list_df[0].to_csv(filename_sec_diag_encoding,
                      index=False,
                      line_terminator='\n')
    for l in list_df[1:]:
        l.to_csv(filename_sec_diag_encoding,
                 index=False,
                 line_terminator='\n',
                 header=False,
                 mode='a')

    list_df = [
        df_encoding_main_diag[i:i + 10000]
        for i in range(0, df_encoding_main_diag.shape[0], 10000)
    ]
    list_df[0].to_csv(filename_main_diag_encoding,
                      index=False,
                      line_terminator='\n')
    for l in list_df[1:]:
        l.to_csv(filename_main_diag_encoding,
                 index=False,
                 line_terminator='\n',
                 header=False,
                 mode='a')
Example #10
     logger.info('Cannot find preprocess data %s, program will shut down.',
                 '{}.preprocessed.pickle'.format(train_file_name_prefix))
     sys.exit()
 dev_file_name_prefix, fileExist = checkPreprocessFile(
     dev_file, add_query_node)
 if not fileExist:
     logger.info('Cannot find preprocess data %s, program will shut down.',
                 '{}.preprocessed.pickle'.format(dev_file_name_prefix))
     sys.exit()
 if not evaluation_mode:
     logger.info('Loading preprocessed training data file %s',
                 '{}.preprocessed.pickle'.format(train_file_name_prefix))
     dataset = Dataset(train_file_name_prefix,
                       use_elmo,
                       use_glove,
                       use_extra_feature,
                       max_nodes=500,
                       max_query_size=25,
                       max_candidates=80,
                       max_candidates_len=10)
     logger.info('Loading preprocessed development data file %s',
                 '{}.preprocessed.pickle'.format(dev_file_name_prefix))
     dev_dataset = Dataset(dev_file_name_prefix,
                           use_elmo,
                           use_glove,
                           use_extra_feature,
                           max_nodes=500,
                           max_query_size=25,
                           max_candidates=80,
                           max_candidates_len=10)
 else:
     logger.info('Loading preprocessed evaluation data file %s',
Example #11
Description: Nothing
FilePath: /Signal-1/AE2-Nets-master/test_CUB.py
'''
from utils.Dataset import Dataset
from AE_BinAE_revise import MaeAEModel
from model import model
from utils.print_result import print_result
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
'''
each net has its own learning_rate(lr_xx), activation_function(act_xx), nodes_of_layers(dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    data = Dataset('CUB_c10_2views')
    x1, x2, gt = data.load_data()
    x1 = data.normalize(x1, 0)
    x2 = data.normalize(x2, 0)
    n_clusters = len(set(gt))
    print(x1.shape)
    print(x2.shape)
    print(gt.shape)
    #act_ae1, act_ae2, act_dg1, act_dg2 = 'sigmoid', 'sigmoid', 'sigmoid', 'sigmoid'
    v1_aedims_ = [[x1.shape[1], 512, 256], [256, 512, x1.shape[1]]]

    v2_aedims_ = [[x2.shape[1], 256, 128], [128, 256, x2.shape[1]]]
    # original setting
    mae_dims_ = [[256, 128, 64], [128, 128, 64], [64, 128, 256],
                 [64, 128, 128]]
    # currently used setting
Example #12
G = AEI_Net(512).to(device)
D = MultiscaleDiscriminator(input_nc=3,
                            ndf=64,
                            n_layers=6,
                            norm_layer=torch.nn.InstanceNorm2d).to(device)
G.train()
D.train()

arcface = Backbone(50, 0.6, 'ir_se').to(device)
arcface.eval()
arcface.load_state_dict(torch.load("./model_weights/model_ir_se50.pth"))

opt_G = optim.Adam(G.parameters(), lr=lr_G, betas=(0, 0.999))
opt_D = optim.Adam(D.parameters(), lr=lr_D, betas=(0, 0.999))

dataset = Dataset("./dataset/celeb/", same_prob=0.2)

dataloader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=True,
                        num_workers=0,
                        drop_last=True)

MSE = torch.nn.MSELoss()
L1 = torch.nn.L1Loss()


def hinge_loss(X, positive=True):
    if positive:
        return torch.relu(1 - X).mean()
    return torch.relu(X).mean()
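
A hedged usage sketch of the hinge objective above for the discriminator side; the variable names are illustrative, and the actual training loop for this example is not shown:

def discriminator_hinge_loss(d_real, d_fake):
    # With hinge_loss as defined above: real outputs are pushed above 1 (relu(1 - x))
    # and fake outputs are pushed towards non-positive values (relu(x)).
    return hinge_loss(d_real, positive=True) + hinge_loss(d_fake, positive=False)
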
def train():
    print("*"*100)
    print("train begin")
    # use gpu
    use_gpu = args.device is not None
    if torch.cuda.is_available() and not use_gpu:
        print("WARNING: You have a CUDA device, should run with -device 0")
    if use_gpu:
        # set cuda device and seed
        torch.cuda.set_device(args.device)
    torch.cuda.manual_seed(args.seed)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    numpy.random.seed(args.seed)
    os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id)

    # prepare file paths
    embedding_file_path = os.path.join(args.project, "embedding.npz")
    vocab_file_path = os.path.join(args.project, "word2id.json")
    end_train_file = os.path.join(args.input, "train_files", "train.txt")
    train_files_dir = os.path.join(args.input, "train_files")

    # merge text files with the same suffix
    merge_same_suf_text_file(train_files_dir, end_train_file, '.txt')

    print('Loading vocab, train and val dataset. Wait a second, please.')
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])  # embed = torch.Tensor(list(np.load(args.embedding)))
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)
    with open(end_train_file) as f:
        examples = list()
        for line in tqdm(f):
            if line and not line.isspace():
                examples.append(json.loads(line))
    train_dataset = Dataset(examples)
    print(train_dataset[:1])

    args.embed_num = embed.size(0)  # read dimensions from the embedding
    args.embed_dim = embed.size(1)
    args.kernel_sizes = [int(ks) for ks in args.kernel_sizes.split(',')]
    net = getattr(models, args.model)(args, embed)
    if use_gpu:
        net.cuda()
    train_iter = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=False)
    criterion = nn.BCELoss()
    params = sum(p.numel() for p in list(net.parameters())) / 1e6
    print('#Params: %.1fM' % (params))

    min_loss = float('inf')
    optimizer = torch.optim.Adam(net.parameters(), lr=args.learning_rate)
    net.train()

    t1 = time()
    for epoch in range(1, args.max_epoch + 1):
        print("*"*10, 'epoch ', str(epoch), '*'*50)
        for i, batch in enumerate(train_iter):
            print("*"*10, 'batch', i, '*'*10)
            features, targets, _, doc_lens = vocab.make_features(batch, args.seq_trunc)
            features, targets = Variable(features), Variable(targets.float())
            if use_gpu:
                features = features.cuda()
                targets = targets.cuda()
            probs = net(features, doc_lens)
            loss = criterion(probs, targets)
            optimizer.zero_grad()
            loss.backward()
            clip_grad_norm(net.parameters(), args.max_norm)
            optimizer.step()
            net.save()
            print('Epoch: %2d Loss: %f' % (epoch, loss))
    t2 = time()
    print('Total Cost:%f h' % ((t2 - t1) / 3600))
    print("模型配置文件保存至输出文件夹")
Example #14
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate(lr_xx), activation_function(act_xx), nodes_of_layers(dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':

    num = 30
    data = Dataset('handwritten_6views')
    X, gt = data.load_data()

    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)

    para_lambda = 1
    batch_size = 2000
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-1
    epochs_pre = 10
    epochs_total = 20
    }
    dict_options_dataset_testing = {
        'dir_data': dirData,
        'data_prefix': 'patrec',
        'dataset': '20162017',
        'encoding': 'categorical',
        'newfeatures': {
            'names': constantsPATREC.NEW_FEATURES
        },
        'featurereduction': None,
        'grouping': 'verylightgrouping',
        'filtering': 'EntlassBereich_Gyn'
    }

    options_training = DatasetOptions(dict_options_dataset_training)
    dataset_training = Dataset(dataset_options=options_training)

    dict_opt_rf = {
        'n_estimators': 500,
        'max_depth': 50
    }
    options_rf = OptionsRF(
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_rf)
    clf_rf = ClassifierRF(options_rf)

    dict_opt_lr = {
        'penalty': 'l1',
        'C': 0.5
    }
        dirModelsBase,
        options_training.getFilenameOptions(filteroptions=True),
        options_clf=dict_opt_sgd)
    clf_sgd = ClassifierSGD(options_sgd)

    dict_options_dataset_training = {
        'dir_data': dirData,
        'data_prefix': 'nz',
        'dataset': '2016',
        'newfeatures': {
            'names': constantsNZ.NEW_FEATURES
        },
        'featurereduction': None
    }
    options_testing = DatasetOptions(dict_options_dataset_training)
    dataset_testing = Dataset(dataset_options=options_testing)

    years = [2012, 2013, 2014, 2015]
    for year in years:
        dict_options_dataset_training = {
            'dir_data': dirData,
            'data_prefix': 'nz',
            'dataset': str(year),
            'newfeatures': {
                'names': constantsNZ.NEW_FEATURES
            },
            'featurereduction': None
        }

        options_training = DatasetOptions(dict_options_dataset_training)
        dataset_training = Dataset(dataset_options=options_training)
Example #17
parser.add_argument('--gallery_feature_dir', type=str)
parser.add_argument('--query_feature_dir', type=str)
parser.add_argument('--useCAM', action='store_true')

args = parser.parse_args()

data_transforms = transforms.Compose([
    transforms.Resize((args.img_h, args.img_w)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# image_datasets = {x: datasets.ImageFolder(os.path.join(args.test_dir, x) ,data_transforms) for x in ['gallery','query']}
image_datasets = {
    x: Dataset(os.path.join(args.test_dir, x),
               data_transforms,
               CAM=args.useCAM)
    for x in ['gallery', 'query']
}
# labelsloader = {x: iter(image_datasets[x].imgs) for x in ['gallery', 'query']}
dataloaders = {
    x: torch.utils.data.DataLoader(image_datasets[x],
                                   batch_size=args.batch_size,
                                   shuffle=False,
                                   num_workers=4)
    for x in ['gallery', 'query']
}


def load_network(network):
    save_path = os.path.join(args.model_save_dir,
Example #18
FilePath: /Signal-1/AE2-Nets-master/test_Caltech.py
'''
from utils.Dataset import Dataset
from AE_BinAE_revise import MaeAEModel
from model import model
from utils.print_result import print_result
import os
from collections import Counter

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
'''
each net has its own learning_rate(lr_xx), activation_function(act_xx), nodes_of_layers(dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':
    data = Dataset('Caltech101_7_2views')
    x1, x2, gt = data.load_data()
    x1 = data.normalize(x1, 0)
    x2 = data.normalize(x2, 0)
    n_clusters = len(set(gt))
    print(x1.shape)
    print(x2.shape)
    print(n_clusters)
    #act_ae1, act_ae2, act_dg1, act_dg2 = 'sigmoid', 'sigmoid', 'sigmoid', 'sigmoid'
    v1_aedims_ = [[x1.shape[1], 1024, 512, 256], [256, 512, 1024, x1.shape[1]]]

    v2_aedims_ = [[x2.shape[1], 256, 128], [128, 256, x2.shape[1]]]
    # original setting
    mae_dims_ = [[256, 256], [128, 128, 64], [256, 256], [64, 128, 128]]
    # currently used setting
    #dims_dg1 = [64, 100]
Example #19
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader
from config.test_config import TestConfig
import os
import numpy as np
from PIL import Image

opt = TestConfig().parse()
model = CycleGAN(opt)
model.load_state_dict(
    torch.load('log/snapshot/' + opt.name + '_snapshot_' + str(opt.epoch) +
               '.pkl'))
model.eval()
model.cuda()
dataset = Dataset(opt)
data_loader = DataLoader(dataset,
                         batch_size=1,
                         shuffle=opt.shuffle,
                         num_workers=4)
pic_dir = opt.pic_dir

for iteration, input in enumerate(data_loader):
    model.deal_with_input(input)
    model.test()
    g_A = model.generated_A.cpu().numpy()
    g_B = model.generated_B.cpu().numpy()
    c_A = model.cycled_A.cpu().numpy()
    c_B = model.cycled_B.cpu().numpy()
    #g_A = Image.fromarray(((g_A+1.)/2.*255).astype(np.uint8).transpose(1,2,0))
    #g_A.save(os.path.join(pic_dir, 'generated_A_'+str(opt.epoch)+'.png'))
Example #20
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
each net has its own learning_rate(lr_xx), activation_function(act_xx), nodes_of_layers(dims_xx)
the AE net needs pretraining before the whole optimization
'''
if __name__ == '__main__':

    num = 30
    data = Dataset('ORL_3views')
    X, gt = data.load_data()

    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)

    para_lambda = 1
    batch_size = X['0'].shape[0]
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-2
    epochs_pre = 50
    epochs_total = 200
def learn(
        env,
        policy_fn,
        *,
        timesteps_per_actorbatch,  # timesteps per actor per update
        optim_stepsize,
        optim_batchsize,  # optimization hypers
        gamma,
        lam,  # advantage estimation
        entcoeff=0.0,
        max_episodes=0,
        max_iters=0,
        max_seconds=0,  # time constraint
        callback=None,  # you can do anything in the callback, since it takes locals(), globals()
        adam_epsilon=1e-5,
        schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
        args):
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space,
                   ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", ob_space, ac_space)  # Network for old policy

    # Ops to reassign params from new to old
    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(
        name='lrmult', dtype=tf.float32,
        shape=[])  # learning rate multiplier, updated with schedule

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    newprob = tf.exp(pi.pd.logp(ac))
    oldprob = tf.exp(oldpi.pd.logp(ac))

    ratio = newprob / oldprob

    kl = pi.pd.kl(oldpi.pd)
    mean_kl = tf.reduce_mean(kl)
    get_kl = U.function([ob, ac], kl)
    get_mean_kl = U.function([ob, ac], mean_kl)

    threshold = kl < args.kl_threshold
    threshold = tf.cast(threshold, tf.float32)

    pol_surr = (kl - ratio * atarg / args.sepg_lam) * threshold

    pol_surr = tf.reduce_mean(pol_surr)

    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    lossandgrad = U.function([ob, ac, atarg, ret, lrmult],
                             losses + [U.flatgrad(total_loss, var_list)])

    adam = MpiAdam(var_list, epsilon=adam_epsilon)

    compute_losses = U.function([ob, ac, atarg, ret, lrmult], losses)

    U.initialize()
    adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_actorbatch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100)  # rolling buffer for episode rewards

    running_scores = []

    assert sum([
        max_iters > 0, args.num_timesteps > 0, max_episodes > 0,
        max_seconds > 0
    ]) == 1, "Only one time constraint permitted"

    while True:
        if callback: callback(locals(), globals())
        if args.num_timesteps and timesteps_so_far >= args.num_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(
                1.0 - float(timesteps_so_far) / args.num_timesteps, 0)
        else:
            raise NotImplementedError

        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.log("********** Iteration %i ************" % iters_so_far)

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate
        atarg = (atarg - atarg.mean()) / (
            atarg.std() + 1e-8)  # standardized advantage function estimate

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        assign_old_eq_new()  # set old parameter values to new parameter values

        d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)

        # Here we do a bunch of optimization epochs over the data
        for num_epoch in count():
            losses = [
            ]  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                *newlosses, g = lossandgrad(batch["ob"], batch["ac"],
                                            batch["atarg"], batch["vtarg"],
                                            cur_lrmult)
                g = np.nan_to_num(g)
                adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)

            agg_mean_kl = get_mean_kl(ob, ac)

            if agg_mean_kl > args.agg_kl_threshold or num_epoch == args.optim_epochs:
                break

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))

        rewbuffer.extend(rews)

        mean_score = None

        if rewbuffer:
            mean_score = np.mean(rewbuffer)
            running_scores.append((timesteps_so_far, mean_score))

        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.record_tabular("EpRewMean", mean_score)
            logger.record_tabular("EpThisIter", len(lens))
            logger.record_tabular("EpisodesSoFar", episodes_so_far)
            logger.record_tabular("TimestepsSoFar", timesteps_so_far)
            logger.record_tabular("TimeElapsed", time.time() - tstart)
            logger.record_tabular("NumEpoch", num_epoch)

            logger.dump_tabular()

    return running_scores
Example #22
from utils.DatasetFilter import DatasetFilter
from utils.Dataset import Dataset
from utils.DatasetOptions import DatasetOptions

import helpers.constants as constants
import helpers.constantsNZ as constantsNZ

dirProject = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + '/'
dirData = dirProject + 'data/'
dirPlotsBase = dirProject + 'plots/feature_comparison_wiederkehrer_normal/'

dict_options_analyzing = {
    'dir_data': dirData,
    'data_prefix': 'patrec',
    'dataset': '20122015',
    'grouping': 'verylightgrouping',
    'encoding': 'categorical',
    'newfeatures': {
        'names': constants.NEW_FEATURES
    },
    'featurereduction': None,
    'filter_options': 'chronic_lung'
}

options = DatasetOptions(dict_options_analyzing)
dataset = Dataset(options)

datafilter = DatasetFilter(options)
datafilter.filterDataDisease()
# Email: [email protected]
# Date: Sun 13 Dec 2020 02:50:08 WIB

from model.nn import NNModel
from cf.DiCE import DiCE
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from utils.Dataset import Dataset
from utils.adult_dataset import load_adult_income

if __name__ == "__main__":

    income_df = load_adult_income("data/adult/adult.csv")
    d = Dataset(dataframe=income_df,
                continuous_features=[
                    'age', 'education', 'educational-num', 'capital-gain',
                    'capital-loss', 'hours-per-week', 'native-country'
                ],
                outcome_name='income',
                scaler=MinMaxScaler())
    clf = NNModel(model_path='weights/adult.pth')
    cf = DiCE(d, clf)
    test_instance = {
        'age': 57,
        'workclass': 'Self-Employed',
        'education': 2,
        'educational-num': 10,
        'marital-status': 'Married',
        'occupation': 'Service',
        'relationship': 'Husband',
        'race': 'White',
        'gender': 'Male',
 def __init__(self, dataset_options, dir_plots):
     self.dataset_options = dataset_options
     self.dataset = Dataset(dataset_options=dataset_options)
     self.dir_plots = dir_plots
     return
Example #25
def test_item_file(end_test_file, embedding_file_path, vocab_file_path,
                   use_gpu):
    embed = torch.Tensor(np.load(embedding_file_path)['arr_0'])
    with open(vocab_file_path) as f:
        word2id = json.load(f)
    vocab = Vocab(embed, word2id)
    #with open(end_test_file) as f:
    #    examples = [json.loads(line) for line in f]
    with open(end_test_file) as f:
        examples = list()
        for line in f:
            if line and not line.isspace():
                examples.append(json.loads(line))
    #print(examples[0])
    test_dataset = Dataset(examples)

    test_iter = DataLoader(dataset=test_dataset,
                           batch_size=args.batch_size,
                           shuffle=False)
    load_dir = os.path.join(args.input, 'model_files', 'CNN_RNN.pt')
    if use_gpu:
        checkpoint = torch.load(load_dir)
    else:
        checkpoint = torch.load(load_dir,
                                map_location=lambda storage, loc: storage)
    if not use_gpu:
        checkpoint['args'].device = None
    net = getattr(models, checkpoint['args'].model)(checkpoint['args'])
    net.load_state_dict(checkpoint['model'])
    if use_gpu:
        net.cuda()
    net.eval()
    doc_num = len(test_dataset)

    all_targets = []
    all_results = []
    all_probs = []
    all_acc = []
    all_p = []
    all_r = []
    all_f1 = []
    all_sum = []
    for batch in tqdm(test_iter):
        features, targets, summaries, doc_lens = vocab.make_features(batch)
        if use_gpu:
            probs = net(Variable(features).cuda(), doc_lens)
        else:
            probs = net(Variable(features), doc_lens)
        start = 0
        for doc_id, doc_len in enumerate(doc_lens):
            doc = batch['doc'][doc_id].split('\n')[:doc_len]
            stop = start + doc_len
            prob = probs[start:stop]
            hyp = []
            for _p, _d in zip(prob, doc):
                print(_p)
                print(_d)
                if _p > 0.5:
                    hyp.append(_d)
            if len(hyp) > 0:
                print(hyp)
                all_sum.append("###".join(hyp))
            else:
                all_sum.append('')
            all_targets.append(targets[start:stop])
            all_probs.append(prob)
            start = stop
    file_path_elems = end_test_file.split('/')
    file_name = 'TR-' + file_path_elems[len(file_path_elems) - 1]
    with open(os.path.join(args.output, file_name), mode='w',
              encoding='utf-8') as f:
        for text in all_sum:
            f.write(text.strip() + '\n')
    for item in all_probs:
        all_results.append([1 if tmp > 0.5 else 0 for tmp in item.tolist()])
    print(len(all_results))
    print(len(all_targets))
    print(len(all_probs))
    for _1, _2, _3 in zip(all_results, all_targets, all_probs):
        _2 = _2.tolist()
        _3 = _3.tolist()
        print("*" * 3)
        print('probs : ', _3)
        print('results : ', _1)
        print('targets : ', _2)
        tmp_acc = accuracy_score(_1, _2)
        tmp_p = precision_score(_1, _2)
        tmp_r = recall_score(_1, _2)
        tmp_f1 = f1_score(_1, _2)
        print('acc : ', tmp_acc)
        print('p : ', tmp_p)
        print('r : ', tmp_r)
        print('f1 : ', tmp_f1)
        all_acc.append(tmp_acc)
        all_p.append(tmp_p)
        all_r.append(tmp_r)
        all_f1.append(tmp_f1)
    print('all dataset acc : ', np.mean(all_acc))
    print('all dataset p : ', np.mean(all_p))
    print('all dataset r : ', np.mean(all_r))
    print('all dataset f1 : ', np.mean(all_f1))
    print('all results length : ', len(all_results))
class DataAnalyzer:
    def __init__(self, dataset_options, dir_plots):
        self.dataset_options = dataset_options
        self.dataset = Dataset(dataset_options=dataset_options)
        self.dir_plots = dir_plots
        return

    def _printValues(self, category_names, occ_wiederkehrer, occ_normal):
        for k, name in enumerate(category_names):
            print(name + ': ' + str(occ_wiederkehrer[k]) + ' <-> ' +
                  str(occ_normal[k]))

    def _getFeatureValues(self, df, name_feature):
        column_names = self.dataset.getColumnsDf()
        feature_columns = []
        for col in column_names:
            if col.startswith(name_feature):
                feature_columns.append(col)
        df_feature = df[feature_columns]
        df_feature_wiederkehrer = df_feature.loc[df['Wiederkehrer'] == 1]
        df_feature_normal = df_feature.loc[df['Wiederkehrer'] == 0]
        return [df_feature_normal, df_feature_wiederkehrer]

    def _filterDFdisease(self, feature_name, feature_categories,
                         df_feature_normal, df_feature_wiederkehrer):
        print(df_feature_wiederkehrer.shape)
        print(df_feature_normal.shape)
        series_normal = []
        series_wiederkehrer = []
        for cat in feature_categories:
            series_normal.append(df_feature_normal[feature_name + '_' + cat])
            series_wiederkehrer.append(df_feature_wiederkehrer[feature_name +
                                                               '_' + cat])

        df_feature_normal_filtered = pd.concat(series_normal, axis=1)
        df_feature_wiederkehrer_filtered = pd.concat(series_wiederkehrer,
                                                     axis=1)
        return [df_feature_normal_filtered, df_feature_wiederkehrer_filtered]

    # for categorical features
    def _doComparisonBar(self, df, name_feature):
        filename_plot = self.dir_plots + 'featurecomparison_' + name_feature + '.png'
        print(name_feature)
        categories_feature = self.dataset_options.getFeatureCategories(
            name_feature)
        if name_feature == self.dataset_options.getNameMainDiag():
            if self.dataset_options.getOptionsFiltering(
            ) in self.dataset_options.getDiseaseNames():
                categories_feature = self.dataset_options.getDiseaseICDkeys()
        print(categories_feature)
        values_to_count = range(0, len(categories_feature))

        [df_feature_normal,
         df_feature_wiederkehrer] = self._getFeatureValues(df, name_feature)
        if df_feature_wiederkehrer.shape[1] > 0 and df_feature_normal.shape[
                1] > 0:
            if name_feature == self.dataset_options.getNameMainDiag():
                if self.dataset_options.getOptionsFiltering(
                ) in self.dataset_options.getDiseaseNames():
                    [df_feature_normal, df_feature_wiederkehrer
                     ] = self._filterDFdisease(name_feature,
                                               categories_feature,
                                               df_feature_normal,
                                               df_feature_wiederkehrer)
            num_feature_normal = df_feature_normal.shape[0]
            num_feature_wiederkehrer = df_feature_wiederkehrer.shape[0]
            occ_feature_wiederkehrer = df_feature_wiederkehrer.sum(axis=0)
            occ_feature_normal = df_feature_normal.sum(axis=0)

            self._printValues(categories_feature, occ_feature_wiederkehrer,
                              occ_feature_normal)

            occ_wiederkehrer = occ_feature_wiederkehrer.values
            occ_normal = occ_feature_normal.values
            density_normal = occ_normal / float(num_feature_normal)
            density_wiederkehrer = occ_wiederkehrer / float(
                num_feature_wiederkehrer)

            print(len(values_to_count))
            print(density_wiederkehrer.shape)

            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 10))
            plt.bar(values_to_count,
                    height=density_wiederkehrer.flatten(),
                    width=1.0,
                    align='center',
                    color='b',
                    alpha=0.5)
            plt.bar(values_to_count,
                    height=density_normal.flatten(),
                    width=1.0,
                    align='center',
                    color='m',
                    alpha=0.5)
            plt.xlim([-1, len(categories_feature) + 1])
            plt.xticks(range(0, len(values_to_count)), categories_feature)
            plt.legend(['Wiederkehrer', 'normal'])
            plt.title(name_feature)
            plt.draw()
            plt.savefig(filename_plot, format='png')
            plt.close()

    # for numerical features
    def _doComparisonHist(self, df, name_feature):
        filename_plot = self.dir_plots + 'featurecomparison_' + name_feature + '.png'
        print(name_feature)

        [df_feature_normal,
         df_feature_wiederkehrer] = self._getFeatureValues(df, name_feature)
        if df_feature_wiederkehrer.shape[1] > 0 and df_feature_normal.shape[
                1] > 0:
            num_values_normal = df_feature_normal.shape[0]
            num_values_wiederkehrer = df_feature_wiederkehrer.shape[0]
            values_wiederkehrer = df_feature_wiederkehrer.values
            values_normal = df_feature_normal.values

            print('normal: ' + str(df_feature_normal.shape))
            print('normal: ' + str(df_feature_wiederkehrer.shape))

            if num_values_normal > 0 and num_values_wiederkehrer > 0:
                min_value = float(
                    min(min(values_normal), min(values_wiederkehrer)))
                max_value = float(
                    max(max(values_normal), max(values_wiederkehrer)))
            elif num_values_wiederkehrer > 0:
                min_value = float(min(values_wiederkehrer))
                max_value = float(max(values_wiederkehrer))
            elif num_values_normal > 0:
                min_value = float(min(values_normal))
                max_value = float(max(values_normal))
            else:
                pass

            num_different_values = np.unique(
                np.vstack([values_wiederkehrer, values_normal])).shape[0]
            if num_different_values > 100:
                num_bins_hist = 100
            else:
                num_bins_hist = num_different_values

            print('min value: ' + str(min_value))
            print('max value: ' + str(max_value))

            range_hist = [min_value, max_value]
            # print(bins_hist)
            hist_feature_wiederkehrer, bins_wiederkehrer = np.histogram(
                values_wiederkehrer,
                range=range_hist,
                bins=num_bins_hist,
                density=True)
            hist_feature_normal, bins_normal = np.histogram(values_normal,
                                                            range=range_hist,
                                                            bins=num_bins_hist,
                                                            density=True)
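            # np.histogram with density=True returns a probability density over the value
            # range; dividing by the sum converts both histograms to per-bin probabilities
            # that sum to 1, so the bar heights of the two groups are comparable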
            hist_feature_wiederkehrer = hist_feature_wiederkehrer / hist_feature_wiederkehrer.sum(
            )
            hist_feature_normal = hist_feature_normal / hist_feature_normal.sum(
            )

            bar_width_wiederkehrer = bins_wiederkehrer[
                1:] - bins_wiederkehrer[:-1]
            bar_width_normal = bins_normal[1:] - bins_normal[:-1]

            fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(14, 10))
            plt.bar(bins_wiederkehrer[:-1],
                    height=hist_feature_wiederkehrer,
                    width=bar_width_wiederkehrer,
                    align='edge',
                    color='b',
                    alpha=0.5)
            plt.bar(bins_normal[:-1],
                    height=hist_feature_normal,
                    width=bar_width_normal,
                    align='edge',
                    color='m',
                    alpha=0.5)
            plt.legend(['Wiederkehrer', 'normal'])
            plt.title(name_feature)
            plt.draw()
            plt.savefig(filename_plot, format='png')
            plt.close()

    # Ideally the comparison type would be selected automatically from the feature name,
    # which would require passing a flag along with the feature name. I am not sure that
    # would be practical in the long run, but hard-coding the calls as below is not ideal
    # either (a sketch of such a dispatch follows after this method).
    def doFeatureComparison(self):
        df = self.dataset.getDf()

        df_wiederkehrer = df['Wiederkehrer']
        print('num_wiederkehrer: ' + str(df_wiederkehrer.sum(axis=0)))

        self._doComparisonHist(df, 'ratio_los_age')
        self._doComparisonHist(df, 'ratio_numDK_age')
        self._doComparisonHist(df, 'ratio_numOE_age')
        self._doComparisonHist(df, 'ratio_los_numDK')
        self._doComparisonHist(df, 'ratio_los_numOE')
        self._doComparisonHist(df, 'mult_los_numCHOP')
        self._doComparisonHist(df, 'ratio_numCHOP_age')
        self._doComparisonHist(df, 'Eintrittsalter')
        self._doComparisonHist(df, 'Verweildauer')
        self._doComparisonHist(df, 'numDK')
        self._doComparisonHist(df, 'numOE')
        self._doComparisonHist(df, 'numCHOP')
        self._doComparisonHist(df, 'Langlieger')
        self._doComparisonHist(df, 'equalOE')
        self._doComparisonHist(df, 'previous_visits')
        self._doComparisonHist(df, 'diff_drg_alos')
        self._doComparisonHist(df, 'diff_drg_lowerbound')
        self._doComparisonHist(df, 'diff_drg_upperbound')
        self._doComparisonHist(df, 'rel_diff_drg_alos')
        self._doComparisonHist(df, 'rel_diff_drg_lowerbound')
        self._doComparisonHist(df, 'rel_diff_drg_upperbound')
        self._doComparisonHist(df, 'alos')
        self._doComparisonHist(df, 'ratio_drg_los_alos')

        self._doComparisonBar(df, 'EntlassBereich')
        self._doComparisonBar(df, 'Versicherungsklasse')
        self._doComparisonBar(df, 'Geschlecht')
        self._doComparisonBar(df, 'Forschungskonsent')
        self._doComparisonBar(df, 'Entlassjahr')
        self._doComparisonBar(df, 'Entlassmonat')
        self._doComparisonBar(df, 'Entlasstag')
        self._doComparisonBar(df, 'Aufnahmeart')
        self._doComparisonBar(df, 'Entlassart')
        self._doComparisonBar(df, 'Eintrittsart')
        self._doComparisonBar(df, 'Liegestatus')
        self._doComparisonBar(df, 'Hauptdiagnose')
        # self._doComparisonBar(df, 'CHOP');
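
    # A minimal sketch of the dispatch mentioned above (the method name and the mapping
    # below are illustrative assumptions, not part of the original code): categorical
    # features are routed to the bar comparison, numerical ones to the histogram.
    def doFeatureComparisonAuto(self, feature_types=None):
        df = self.dataset.getDf()
        # feature name -> 'hist' (numerical) or 'bar' (categorical)
        feature_types = feature_types or {
            'Eintrittsalter': 'hist',
            'Verweildauer': 'hist',
            'Geschlecht': 'bar',
            'Hauptdiagnose': 'bar',
        }
        for name_feature, comparison_type in feature_types.items():
            if comparison_type == 'hist':
                self._doComparisonHist(df, name_feature)
            else:
                self._doComparisonBar(df, name_feature)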

    def _getRatioWiederkehrerFlag(self):
        early_readmission_flag = self.dataset_options.getEarlyReadmissionFlagname(
        )
        df = self.dataset.getDf()
        df_wiederkehrer = df[early_readmission_flag]
        num_wiederkehrer = int(df_wiederkehrer.sum(axis=0))
        num_all = int(df.shape[0])
        print('num all: ' + str(num_all))
        print('num wiederkehrer: ' + str(num_wiederkehrer))
        print('ratio wiederkehrer: ' +
              str(float(num_wiederkehrer) / float(num_all)))

    def _getRatio18DaysReturn(self):
        df = self.dataset.getDf()
        df = df.sort_values(by=['Patient', 'Aufnahmedatum'])
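        # walk through each patient's cases in chronological order; whenever a new case
        # starts within 18 days of the previous discharge, mark the previous case as a
        # 'Wiederkehrer' (early readmission)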
        patient_ids_wiederkehrer = df['Patient'].unique()
        single_visiting_patients = 0
        for k in range(0, len(patient_ids_wiederkehrer)):
            p_id = patient_ids_wiederkehrer[k]
            cases_df = df.loc[df['Patient'] == p_id]
            new_patient = True
            if cases_df.shape[0] == 1:
                single_visiting_patients += 1
            for index, row in cases_df.iterrows():
                if not new_patient:
                    timestamp_enter = row['Aufnahmedatum']
                    diff = (datetime.fromtimestamp(timestamp_enter) -
                            datetime.fromtimestamp(timestamp_previous_exit))
                    days = diff.days
                    if int(days) <= 18:
                        # print(str(datetime.fromtimestamp(timestamp_enter).strftime("%y,%m,%d")) + ' vs. ' + str(datetime.fromtimestamp(timestamp_previous_exit).strftime("%y,%m,%d")))
                        # print(str(int(row['Patient'])) + ': ' + ' --> ' + str(days) + ' --> ' + str(row['Wiederkehrer']))
                        df.at[index_previous, 'Wiederkehrer'] = 1
                else:
                    new_patient = False
                timestamp_previous_exit = row['Entlassdatum']
                index_previous = index

        num_wiederkehrer_all = int(df['Wiederkehrer'].sum(axis=0))
        num_all = int(df.shape[0])
        print('patients with only a single visit: ' +
              str(single_visiting_patients))
        print('num all: ' + str(num_all))
        print('num wiederkehrer all: ' + str(num_wiederkehrer_all))
        print('ratio wiederkehrer all: ' +
              str(float(num_wiederkehrer_all) / float(num_all)))

    def checkWiederkehrer(self):
        self._getRatioWiederkehrerFlag()
        if self.dataset_options.getDataPrefix() == 'patrec':
            self._getRatio18DaysReturn()

    def _getNumberColumnsSubgroupPatrec(self, subgroup):
        dir_data = self.dataset_options.getDirData()
        dataset = self.dataset_options.getDatasetName()
        chunksize = self.dataset_options.getChunkSize()
        filename_data_subgroup = dir_data + 'data_patrec_' + dataset + '_' + subgroup + '_clean.csv'

        subgroup_data_reader = pd.read_csv(filename_data_subgroup,
                                           chunksize=chunksize)
        for k, chunk in enumerate(subgroup_data_reader):
            chunk = chunk.drop(self.dataset_options.getEventColumnName(),
                               axis=1)
            columns = list(chunk.columns)
            sum_chunk = chunk.sum(axis=0)
            if k == 0:
                sum_subgroup = pd.DataFrame(data=np.zeros((1, len(columns))),
                                            columns=columns)
            sum_subgroup = sum_subgroup.add(sum_chunk)

        num_columns = int(sum_subgroup.astype(bool).sum(axis=1).values)
        print(subgroup + ' --> number of columns: ' + str(len(columns)))
        print(subgroup + ' --> number of non-zero columns: ' +
              str(num_columns))

    def _getAvgNumSubgroupPatrec(self, subgroup):
        dir_data = self.dataset_options.getDirData()
        dataset = self.dataset_options.getDatasetName()
        name_demographic_features = self.dataset_options.getFilenameOptionDemographicFeatures(
        )
        encoding = self.dataset_options.getEncodingScheme()
        feature_set_str = self.dataset_options.getFeatureSetStr()
        filename_data_subgroup = dir_data + 'data_patrec_' + dataset + '_' + name_demographic_features + '_' + feature_set_str + '_' + encoding + '.csv'
        df = pd.read_csv(filename_data_subgroup)

        df_num_subgroup = df['num' + subgroup]
        avg_num = np.mean(df_num_subgroup.values)
        return avg_num

    def _getAvgNumSubgroupNZ(self):
        dir_data = self.dataset_options.getDirData()
        dataset = self.dataset_options.getDatasetName()
        name_demographic_features = self.dataset_options.getFilenameOptionDemographicFeatures(
        )
        grouping = self.dataset_options.getGroupingName()
        encoding = self.dataset_options.getEncodingScheme()
        feature_set_str = self.dataset_options.getFeatureSetStr()
        filename_data_subgroup = dir_data + 'data_nz_' + dataset + '_' + feature_set_str + '_' + encoding + '_' + grouping + '.csv'
        df = pd.read_csv(filename_data_subgroup)

        df_num_subgroup = df['diag_DIAG_COUNT']
        avg_num = np.mean(df_num_subgroup.values)
        return avg_num

    def _getNumberColumnsSubgroupNZ(self, subgroup):
        # not implemented yet: only the filename of the cleaned subgroup file is assembled
        dir_data = self.dataset_options.getDirData()
        dataset = self.dataset_options.getDatasetName()
        chunksize = self.dataset_options.getChunkSize()
        filename_data_subgroup = dir_data + 'data_nz_' + dataset + '_' + subgroup + '_clean.csv'

    def _getNumberHauptdiagnosePatrec(self):
        dir_data = self.dataset_options.getDirData()
        dataset = self.dataset_options.getDatasetName()
        filename_data = dir_data + 'data_patrec_' + dataset + '_REST_clean.csv'
        df = pd.read_csv(filename_data)
        diff_values_hauptdiagnose = list(set(df['Hauptdiagnose'].values))
        print('Hauptdiagnose --> number of values: ' +
              str(len(diff_values_hauptdiagnose)))

    def _getNumberHauptdiagnoseNZ(self):
        dir_data = self.dataset_options.getDirData()
        dataset = self.dataset_options.getDatasetName()
        filename_data = dir_data + 'data_nz_' + dataset + '_discharge.csv'
        df = pd.read_csv(filename_data)
        diff_values_hauptdiagnose = list(set(df['main_diag'].values))
        print('Hauptdiagnose --> number of values: ' +
              str(len(diff_values_hauptdiagnose)))

    def getNumberColumnsSubgroup(self, subgroup):
        data_prefix = self.dataset_options.getDataPrefix()
        if data_prefix == 'patrec':
            self._getNumberColumnsSubgroupPatrec(subgroup)
        elif data_prefix == 'nz':
            # the NZ variant (_getNumberColumnsSubgroupNZ) is not implemented yet
            pass
        else:
            print('data prefix is unknown...exit')
            sys.exit()

    def getNumberHauptdiagnose(self):
        data_prefix = self.dataset_options.getDataPrefix()
        if data_prefix == 'patrec':
            self._getNumberHauptdiagnosePatrec()
        elif data_prefix == 'nz':
            self._getNumberHauptdiagnoseNZ()
        else:
            print('data prefix is unknown...exit')
            sys.exit()

    def getAvgNumberSubgroup(self, subgroup):
        data_prefix = self.dataset_options.getDataPrefix()
        if data_prefix == 'patrec':
            avg_num = self._getAvgNumSubgroupPatrec(subgroup)
            return avg_num
        elif data_prefix == 'nz':
            if not subgroup == 'DK':
                print('only implemented for diagnoses...exit')
                sys.exit()
            avg_num = self._getAvgNumSubgroupNZ()
            return avg_num
        else:
            print('data prefix is unknown...exit')
            sys.exit()

# The start of this example is not shown; the argparse setup and standard-library
# imports below are reconstructed as assumptions (a '--model' argument is assumed
# because conf.model is used further down, and project-local names such as Params,
# Dataset, Logger, Evaluator and the models module are taken to be importable).
import argparse
import os

import numpy as np
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, required=True)
parser.add_argument('--data_dir', type=str, default='./data')
parser.add_argument('--save_dir', type=str, default='./saves')
parser.add_argument('--conf_dir', type=str, default='./conf')
parser.add_argument('--seed', type=int, default=225)

conf = parser.parse_args()
model_conf = Params(os.path.join(conf.conf_dir, conf.model.lower() + '.json'))

np.random.seed(conf.seed)
torch.random.manual_seed(conf.seed)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device(
    'cpu')

dataset = Dataset(data_dir=conf.data_dir,
                  data_name=model_conf.data_name,
                  train_ratio=model_conf.train_ratio,
                  device=device)

log_dir = os.path.join('saves', conf.model)
logger = Logger(log_dir)
model_conf.save(os.path.join(logger.log_dir, 'config.json'))

eval_pos, eval_target = dataset.eval_data()
item_popularity = dataset.item_popularity
evaluator = Evaluator(eval_pos, eval_target, item_popularity, model_conf.top_k)

model_base = getattr(models, conf.model)
model = model_base(model_conf, dataset.num_users, dataset.num_items, device)

logger.info(model_conf)
logger.info(dataset)
Exemple #28
0
import os
import numpy as np
from utils.Dataset import Dataset
from model import model_multi_view
from utils.cluster import cluster
import csv

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
print(os.environ['CUDA_VISIBLE_DEVICES'])
'''
Each net has its own learning_rate (lr_xx), activation_function (act_xx) and nodes_of_layers (dims_xx).
The ae net needs pretraining before the whole optimization.
'''
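
# A minimal illustration of the lr_xx / act_xx / dims_xx convention described above
# (all names and dimensions here are placeholder assumptions, not values from this
# example): each sub-network keeps its own learning rate, activation and layer sizes.
def example_net_config():
    lr_ae, act_ae, dims_ae = 1.0e-3, 'sigmoid', [[1024, 500, 200], [944, 500, 200]]
    lr_dg, act_dg, dims_dg = 1.0e-3, 'sigmoid', [[200, 500, 1024], [200, 500, 944]]
    lr_h = 1.0e-2  # learning rate for H (assumed here to be the shared representation)
    return lr_ae, act_ae, dims_ae, lr_dg, act_dg, dims_dg, lr_h
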
if __name__ == '__main__':

    num = 10
    data = Dataset('COIL20_3views')
    X, gt = data.load_data()

    acc_H_all = np.zeros(num)
    nmi_H_all = np.zeros(num)
    RI_H_all = np.zeros(num)
    f1_H_all = np.zeros(num)

    para_lambda = 1
    batch_size = X['0'].shape[0]
    lr_pre = 1.0e-3
    lr_ae = 1.0e-3
    lr_dg = 1.0e-3
    lr_h = 1.0e-2
    epochs_pre = 300
    epochs_total = 100
Exemple #29
0
import tensorflow as tf
import numpy as np
import scipy.io as scio
from utils.Net_ae import Net_ae
from utils.Net_dg import Net_dg
from utils.next_batch import next_batch
import math
from sklearn.utils import shuffle
import timeit
from keras.layers import *
from utils.print_result import print_result
from keras.models import Model
from utils.Dataset import Dataset
data = Dataset('handwritten_2views')
x1, x2, gt = data.load_data()
x1 = data.normalize(x1, 0)
x2 = data.normalize(x2, 0)
n_clusters = len(set(gt))

def xavier_init(fan_in, fan_out, constant=1):
    low = -constant * np.sqrt(6.0 / (fan_in + fan_out))
    high = constant * np.sqrt(6.0 / (fan_in + fan_out))
    return tf.random_uniform((fan_in, fan_out),
                             minval=low, maxval=high,
                             dtype=tf.float32)


class dualModel:
    def __init__(self, epochs):
        self.epochs = epochs

    def train_model(self, X1, X2, gt, para_lambda, dims, act, lr, epochs, batch_size):
        err_total = list()
        start = timeit.default_timer()
    for year in years:
        print('year: ' + str(year))
        dict_options_dataset = {
            'dir_data': dirData,
            'data_prefix': 'nz',
            'dataset': str(year),
            'encoding': 'embedding',
            'grouping': 'verylightgrouping',
            'newfeatures': None,
            'featurereduction': {
                'method': 'FUSION'
            }
        }

        options_dataset_year = DatasetOptions(dict_options_dataset)
        dataset_year = Dataset(options_dataset_year)
        if balanced:
            df_year = dataset_year.getBalancedSubSet()
        else:
            df_year = dataset_year.getDf()

        #df_year['main_diag'] = df_year['main_diag'].apply(convertDiagToInd)
        print(df_year.shape)
        df_all_years = df_all_years.append(df_year)

    print('df balanced all years: ' + str(df_all_years.shape))

    encoding = options_dataset_year.getEncodingScheme()
    grouping = options_dataset_year.getGroupingName()
    featureset = options_dataset_year.getFeatureSetStr()
    filename_data_years = dirData + 'data_nz_' + str(min(years)) + str(