def __init__(self, data_name, target_class, under_sampling=None):
    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    self.datalist_encoded = [d for d in dataset.datalist]
    self.datalist = [dataset.decode(d) for d in self.datalist_encoded]
    X, y = get_X_y(self.datalist)
    # Query kneighbors with the numeric feature matrix X; the decoded data
    # objects in self.datalist are not valid input for sklearn.
    distance, self.itneighbors = KNeighborsClassifier().fit(X, y).kneighbors(X)
    self.dataset = dataset
    self.attrtype_dict = dataset.attrtype_dict
    self.attrlists = [d.attrlist for d in self.datalist]
    self.classlists = [d.dataclass for d in self.datalist]
    self.data_class = target_class
    self.under_sampling = under_sampling
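# Illustrative sketch of the kneighbors call above (sklearn and numpy only;
# the shapes are made up). Fitting on the feature matrix and then querying it
# returns, for every sample, the distances to and indices of its neighbors;
# n_neighbors defaults to 5, and each training sample's nearest neighbor is
# itself at distance 0.
#     from sklearn.neighbors import KNeighborsClassifier
#     import numpy as np
#     X = np.random.rand(20, 4)
#     y = np.random.randint(0, 2, 20)
#     dist, idx = KNeighborsClassifier().fit(X, y).kneighbors(X)
#     print(dist.shape, idx.shape)  # (20, 5) (20, 5)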
def do_classify(data_name='kdd99_binary_test.dat'):
    print('------------------------------------')
    dataset = MyDataSet(data_name)
    class_dict = dataset.class_dict
    print(data_name)
    print(class_dict)
    # class_dict maps class label -> sample count; invert it to look up labels
    # by count (assumes the class sizes are distinct)
    reversed_class_dict = dict(zip(class_dict.values(), class_dict.keys()))
    len_list = list(reversed_class_dict.keys())
    positive_len, negative_len = max(len_list), min(len_list)
    positive_class = reversed_class_dict[positive_len]
    negative_class = reversed_class_dict[negative_len]
    # Assign each discrete attribute value a running integer code, one dict
    # per attribute position, seeded with the first row's value at code 0
    discrete_num_map = [{k: 0} for k in dataset.data_list_discrete[0]]
    for data in dataset.data_list_discrete:
        for attr, attr_map in zip(data, discrete_num_map):
            if attr not in attr_map:
                attr_map[attr] = max(attr_map.values()) + 1
    # Binary classification (a sketch of this encoding follows below)
    print('{} vs {}'.format(positive_class, negative_class))
    binary_classify(data_train=copy.deepcopy(dataset.data_list_total),
                    positive=positive_class,
                    negative=negative_class,
                    positive_len=positive_len,
                    negative_len=negative_len,
                    expend=True,
                    data_name=data_name,
                    discrete_num_map=discrete_num_map)
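# Toy demonstration of the running integer-code maps built above ('rows' is
# made up for illustration). Each attribute position gets its own dict, and
# every unseen value receives the next free code:
#     rows = [['tcp', 'http'], ['udp', 'http'], ['tcp', 'ftp']]
#     data_map = [{k: 0} for k in rows[0]]     # [{'tcp': 0}, {'http': 0}]
#     for row in rows:
#         for attr, attr_map in zip(row, data_map):
#             if attr not in attr_map:
#                 attr_map[attr] = max(attr_map.values()) + 1
#     print(data_map)  # [{'tcp': 0, 'udp': 1}, {'http': 0, 'ftp': 1}]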
def __init__(self, dataset: MyDataSet, output_data_label, output_data_size,
             batch_size, learning_rate, module_features, log=False):
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.output_data_label = output_data_label
    self.output_data_size = output_data_size
    self.dataset = dataset
    self.dataloader_train = DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       drop_last=False)
    self.net = NetVAE(module_features=module_features)
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
    # self.optimizer = torch.optim.SGD(self.net.parameters(), lr=learning_rate)
    self.log = log
    if log:
        self.dir_path = 'KDD99_FAKE_{}'.format(dataset.data_name)
        self.file_path = '/from_{}_gen_{}_label={}_'.format(
            len(dataset), output_data_size, self.output_data_label)
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)
        self.loss_log = open(self.dir_path + self.file_path + 'loss_log.txt', 'w')
    self.output_data_list = []
    self._output_data = []
def gen_with_multi_vae(target_class, target_num, data_name):
    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    module_features = (dataset.single_continuous_data_len, 30, 20, 16)
    lr = 0.00088
    batch_size = 100
    # First stage: train on the real minority data, generate half the target
    trainer = Trainer(module_features=module_features,
                      learning_rate=lr,
                      batch_size=batch_size,
                      dataset=dataset,
                      output_data_label=target_class,
                      output_data_size=target_num // 2)
    trainer(100)
    temp = trainer.output_data
    print(temp[0].attr_list)
    # Second stage: retrain on the generated samples, produce the full amount
    dataset = MyDataSet(temp, target_class=target_class, encode=True)
    trainer = Trainer(module_features=module_features,
                      learning_rate=lr,
                      batch_size=batch_size,
                      dataset=dataset,
                      output_data_label=target_class,
                      output_data_size=target_num)
    trainer(100)
    return [data.to_list(DataType.CONTINUOUS) for data in trainer.output_data]
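# Design note and hedged usage sketch: the two-stage scheme above first emits
# target_num // 2 samples from a VAE trained on real minority data, then
# retrains a fresh VAE on those synthetic samples to emit the full target_num,
# trading fidelity to the real distribution for extra diversity. The arguments
# below are illustrative only; the file name mirrors the 'kdd99_<class>.kdd99'
# pattern used elsewhere in this project and is not a confirmed path:
#     fake = gen_with_multi_vae(target_class='land', target_num=200,
#                               data_name='kdd99_land.kdd99')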
def gen_with_vae(target_class, target_num, data_name):
    learning_rate = 0.000921
    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    trainer = Trainer(module_features=(dataset.single_continuous_data_len, 30, 20, 16),
                      learning_rate=learning_rate,
                      batch_size=64,
                      dataset=dataset,
                      output_data_label=target_class,
                      output_data_size=target_num)
    trainer(80)
    return trainer.output_data
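# Hedged usage sketch: gen_with_vae oversamples one minority class in a single
# pass. The target of 200 synthetic samples is an arbitrary illustration, and
# the file name again follows the project's 'kdd99_<class>.kdd99' pattern:
#     fake_land = gen_with_vae(target_class='land', target_num=200,
#                              data_name='kdd99_land.kdd99')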
def __init__(self, data_name, target_class, target_num, module_features,
             learning_rate, batch_size, log=False):
    self.learning_rate = learning_rate
    self.batch_size = batch_size
    self.target_class = target_class
    self.target_num = target_num
    dataset = MyDataSet(data_name, target_class=target_class)
    for d in dataset.datalist:
        print(d.attrlist)
    self.dataset = dataset
    # print(dataset.data_max)
    # print(dataset.data_min)
    self.dataloader_train = DataLoader(dataset=dataset,
                                       batch_size=batch_size,
                                       shuffle=True,
                                       drop_last=False)
    # Prepend the input width so the encoder's first layer matches the data;
    # list(...) also accepts a tuple of hidden sizes
    self.net = NetVAE(module_features=[len(dataset.datalist[0])] + list(module_features))
    self.optimizer = torch.optim.Adam(self.net.parameters(), lr=learning_rate)
    self.targetdata_list = []
    self.log = log
    if log:
        self.dir_path = 'KDD99_FAKE_{}'.format(dataset.dataname)
        self.file_path = '/from_{}_gen_{}_label={}_'.format(
            len(dataset), target_num, target_class)
        if not os.path.exists(self.dir_path):
            os.mkdir(self.dir_path)
        self.loss_log = open(self.dir_path + self.file_path + 'loss_log.txt', 'w')
def do_classify(data_name='new_data.dat', binary_class=True, using_kdd99=False):
    # TODO print
    print('------------------------------------')
    dataset = MyDataSet(data_name)
    class_dict = dataset.class_dict
    # TODO print
    print(data_name)
    print(class_dict)
    # The majority class is treated as the positive class
    pos_len = 0
    positive = ''
    for k, v in class_dict.items():
        if v > pos_len:
            pos_len = v
            positive = k
    class_dict.pop(positive)
    negatives = list(class_dict.keys())
    negs_len = list(class_dict.values())
    # Build the shared integer codes for discrete attributes (the per-row
    # encoded lists are recomputed later, only data_map is needed here)
    data_map = [{k: 0} for k in dataset.data_list_discrete[0]]
    for data in dataset.data_list_discrete:
        for attr, attr_map in zip(data, data_map):
            if attr not in attr_map:
                attr_map[attr] = max(attr_map.values()) + 1
    # Binary classification: the majority class vs each minority class in turn,
    # once without oversampling (expend=False) and once with it (expend=True)
    if binary_class:
        for negative, neg_len in zip(negatives, negs_len):
            print('{} vs {}'.format(positive, negative))
            binary_classify(data_train_total=copy.deepcopy(dataset.data_list_total),
                            positive=positive, negative=negative,
                            pos_len=pos_len, neg_len=neg_len,
                            expend=False, using_kdd99=using_kdd99,
                            data_name=data_name, data_map=data_map,
                            vae_only=False)
            binary_classify(data_train_total=copy.deepcopy(dataset.data_list_total),
                            positive=positive, negative=negative,
                            pos_len=pos_len, neg_len=neg_len,
                            expend=True, using_kdd99=using_kdd99,
                            data_name=data_name, data_map=data_map,
                            vae_only=False)
    # Multi-class classification
    if not binary_class:
        multi_classify(data_train_total=copy.deepcopy(dataset.data_list_total),
                       positive=positive, negatives=negatives,
                       pos_len=pos_len, negs_len=negs_len,
                       expend=False, using_kdd99=using_kdd99,
                       data_name=data_name, data_map=data_map)
        multi_classify(data_train_total=copy.deepcopy(dataset.data_list_total),
                       positive=positive, negatives=negatives,
                       pos_len=pos_len, negs_len=negs_len,
                       expend=True, using_kdd99=using_kdd99,
                       data_name=data_name, data_map=data_map)
def __init__(self, data_name, target_class, data_map):
    dataset = MyDataSet(data_name, encode=False, target_class=target_class)
    self.__dataset = dataset
    self.__target_class = target_class
    data_list = dataset.data_list_discrete
    # Re-encode discrete attributes with the shared integer codes in data_map,
    # extending the map when a previously unseen value appears
    data_num_list = []
    for data in data_list:
        new_data = []
        for attr, attr_map in zip(data, data_map):
            if attr not in attr_map:
                attr_map[attr] = max(attr_map.values()) + 1
            new_data.append(attr_map[attr])
        data_num_list.append(new_data)
    self.__data_num_list = data_num_list
    self.__data_map = data_map
def do_classify(data_name):
    # TODO print
    print('------------------------------------')
    dataset = MyDataSet(data_name)
    class_dict = dataset.class_dict
    # TODO print
    print(data_name)
    print(class_dict)
    # Invert class label -> count (assumes the class sizes are distinct)
    reversed_class_dict = dict(zip(class_dict.values(), class_dict.keys()))
    len_list = sorted(reversed_class_dict.keys())
    positive_len = max(len_list)
    negative_lens = [l for l in len_list if l != positive_len]
    positive = reversed_class_dict[positive_len]
    negatives = [reversed_class_dict[l] for l in negative_lens]
    discrete_num_map = [{k: 0} for k in dataset.data_list_discrete[0]]
    for data in dataset.data_list_discrete:
        for attr, attr_map in zip(data, discrete_num_map):
            if attr not in attr_map:
                attr_map[attr] = max(attr_map.values()) + 1
    # Multi-class classification, without and with oversampling
    multi_classify(data_train=copy.deepcopy(dataset.data_list_total),
                   positive=positive, negatives=negatives,
                   positive_len=positive_len, negative_lens=negative_lens,
                   expend=False, data_name=data_name,
                   discrete_num_map=discrete_num_map)
    multi_classify(data_train=copy.deepcopy(dataset.data_list_total),
                   positive=positive, negatives=negatives,
                   positive_len=positive_len, negative_lens=negative_lens,
                   expend=True, data_name=data_name,
                   discrete_num_map=discrete_num_map)
    if plot:
        plt.plot(x, y, color=color)


def normfun(x, mu, sigma):
    # Probability density function of the normal distribution N(mu, sigma^2)
    pdf = np.exp(-((x - mu)**2) / (2 * sigma**2)) / (sigma * np.sqrt(2 * np.pi))
    return pdf


if __name__ == '__main__':
    dataname = 'ecoli4.dat'
    print(dataname)
    # dataname = 'kdd99_new_multi.dat'
    dataset = MyDataSet(dataname, encode=True)
    reversed_class_dict = dict(
        zip(dataset.dataclass_dict.values(), dataset.dataclass_dict.keys()))
    len_list = sorted(reversed_class_dict.keys())
    # Assumes a binary dataset: the larger class is positive
    positive_len, negative_len = len_list[1], len_list[0]
    positive_class = reversed_class_dict[positive_len]
    negative_class = reversed_class_dict[negative_len]
    # Number of synthetic minority samples needed to balance the classes
    negative_len = positive_len - negative_len
    dataset_negative = MyDataSet(dataname, target_class=negative_class, encode=True)
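# Illustrative sanity check for normfun above (numpy only; the mu and sigma
# values are arbitrary): the density should integrate to ~1 over a wide
# interval and peak at x = mu.
#     xs = np.linspace(-10, 10, 10001)
#     print(np.trapz(normfun(xs, mu=1.0, sigma=2.0), xs))   # ~1.0
#     print(xs[np.argmax(normfun(xs, mu=1.0, sigma=2.0))])  # ~1.0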
def loss_function(input_tensor, target_tensor, mean, log_var) -> (Tensor, Tensor):
    # Cosine similarity measures agreement, so convert it to a loss as
    # (1 - similarity); minimizing the raw similarity sum would push the
    # reconstructions away from their targets
    reconstruction_loss = (1 - torch.nn.CosineSimilarity()(input_tensor, target_tensor)).sum()
    # reconstruction_loss = torch.nn.BCELoss(reduction='sum')(input_tensor, target_tensor)
    # KL(q(z|x) || N(0, I)) for a diagonal Gaussian posterior
    kl_divergence = -0.5 * torch.sum(1 + log_var - torch.exp(log_var) - mean**2)
    return reconstruction_loss, kl_divergence


if __name__ == '__main__':
    target_class = 'land'
    data_name = 'kdd99_{}.kdd99'.format(target_class)
    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    trainer = Trainer(module_features=(dataset.single_continuous_data_len, 30, 20, 16),
                      learning_rate=0.000918,
                      batch_size=100,
                      dataset=dataset,
                      output_data_label=target_class,
                      output_data_size=40)
    trainer(50)
    # TODO print
    print(trainer.output_data)
    dataset = MyDataSet(trainer.output_data, target_class=target_class, encode=True)
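
    # Hedged sanity check (added illustration, not from the original run):
    # with mean = 0 and log_var = 0 the approximate posterior equals the
    # N(0, I) prior, so the KL term of loss_function should be exactly zero,
    # and identical inputs should give ~0 reconstruction loss.
    zero_mean = torch.zeros(4, 16)
    zero_log_var = torch.zeros(4, 16)
    x = torch.rand(4, 16)
    rec, kl = loss_function(x, x.clone(), zero_mean, zero_log_var)
    print(rec.item(), kl.item())  # ~0.0 and 0.0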