Exemple #1
0
 def __init__(self, data_name, target_class, under_sampling=None):
     """Load one class of ``data_name`` and query neighbours for each record.

     Args:
         data_name: Forwarded to ``MyDataSet`` as the data source.
         target_class: Class label forwarded to ``MyDataSet`` and kept as
             ``self.data_class``.
         under_sampling: Stored as-is on ``self.under_sampling``.
     """
     self.data_class = target_class
     self.under_sampling = under_sampling

     dataset = MyDataSet(data_name, target_class=target_class, encode=True)
     self.dataset = dataset

     # Keep the encoded records next to their decoded counterparts.
     encoded = list(dataset.datalist)
     self.datalist_encoded = encoded
     self.datalist = [dataset.decode(item) for item in encoded]

     # Fit on the decoded records, then query the fitted model with those
     # same records; only the second element of the result is kept.
     # NOTE(review): presumably scikit-learn's KNeighborsClassifier, whose
     # kneighbors() returns (distances, indices) -- confirm the import.
     features, labels = get_X_y(self.datalist)
     knn = KNeighborsClassifier().fit(features, labels)
     _, self.itneighbors = knn.kneighbors(self.datalist)

     self.attrtype_dict = dataset.attrtype_dict

     # Per-record attribute lists and class labels, in record order.
     self.attrlists = []
     self.classlists = []
     for record in self.datalist:
         self.attrlists.append(record.attrlist)
         self.classlists.append(record.dataclass)
Exemple #2
0
def do_classify(data_name='kdd99_binary_test.dat'):
    """Run the binary-classification experiment on one data file.

    The class with the largest entry in ``dataset.class_dict`` is used as the
    positive class and the class with the smallest entry as the negative
    class (the values are assumed to be per-class sample counts -- they are
    passed on as ``positive_len`` / ``negative_len``).

    Args:
        data_name: Name of the data file handed to ``MyDataSet``.
    """
    print('------------------------------------')

    dataset = MyDataSet(data_name)
    class_dict = dataset.class_dict

    print(data_name)
    print(class_dict)

    def count_of(item):
        return item[1]

    # Select on (class, count) pairs directly. Inverting the dict into
    # count -> class silently merges classes that share a count, which made
    # positive and negative the *same* class whenever the two counts tied.
    class_counts = list(class_dict.items())

    # Scanning in reverse lets the later entry win a tie, which is what the
    # former count -> class inversion did.
    positive_class, positive_len = max(reversed(class_counts), key=count_of)

    # Prefer a negative class that differs from the positive one; fall back
    # to the full list only when the data holds a single class.
    others = [item for item in class_counts if item[0] != positive_class]
    negative_class, negative_len = min(reversed(others or class_counts),
                                       key=count_of)

    # One value -> integer code dict per discrete attribute position. The
    # first record seeds every dict with code 0; each unseen value then gets
    # the next free code, which equals the current dict size.
    discrete_num_map = [{k: 0} for k in dataset.data_list_discrete[0]]
    for data in dataset.data_list_discrete:
        for attr, attr_map in zip(data, discrete_num_map):
            attr_map.setdefault(attr, len(attr_map))

    # Binary classification
    print('{} vs {}'.format(positive_class, negative_class))
    binary_classify(data_train=copy.deepcopy(dataset.data_list_total),
                    positive=positive_class,
                    negative=negative_class,
                    positive_len=positive_len,
                    negative_len=negative_len,
                    expend=True,
                    data_name=data_name,
                    discrete_num_map=discrete_num_map)
Exemple #3
0
    def __init__(self,
                 dataset: MyDataSet,
                 output_data_label,
                 output_data_size,
                 batch_size,
                 learning_rate,
                 module_features,
                 log=False):
        """Set up the data loader, VAE network, optimizer and optional loss log.

        Args:
            dataset: Training data; wrapped in a shuffling ``DataLoader``.
            output_data_label: Stored on the instance and used in the log
                file name.
            output_data_size: Stored on the instance and used in the log
                file name.
            batch_size: Batch size handed to the ``DataLoader``.
            learning_rate: Learning rate handed to the Adam optimizer.
            module_features: Forwarded to ``NetVAE``.
            log: When true, create ``KDD99_FAKE_<data_name>`` if needed and
                open a loss-log file inside it for writing.
        """
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.output_data_label = output_data_label
        self.output_data_size = output_data_size

        self.dataset = dataset
        self.dataloader_train = DataLoader(dataset=dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           drop_last=False)

        self.net = NetVAE(module_features=module_features)
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=learning_rate)

        self.log = log
        if log:
            self.dir_path = 'KDD99_FAKE_{}'.format(dataset.data_name)
            self.file_path = '/from_{}_gen_{}_label={}_'.format(
                len(dataset), output_data_size, self.output_data_label)
            # exist_ok=True replaces the exists()-then-mkdir() pair, which
            # could fail if the directory appeared between the two calls.
            os.makedirs(self.dir_path, exist_ok=True)
            # The handle is kept on the instance and is not closed here.
            self.loss_log = open(
                self.dir_path + self.file_path + 'loss_log.txt', 'w')
        self.output_data_list = []
        self._output_data = []
Exemple #4
0
def gen_with_multi_vae(target_class, target_num, data_name):
    """Generate samples of ``target_class`` with two chained VAE trainings.

    The first trainer is built on the samples loaded from ``data_name`` and
    asked for ``target_num // 2`` outputs; its output is wrapped in a new
    ``MyDataSet`` that feeds a second trainer asked for ``target_num``.

    Args:
        target_class: Class label to load and to generate.
        target_num: Output size requested from the second trainer.
        data_name: Data source handed to the first ``MyDataSet``.

    Returns:
        A list with ``data.to_list(DataType.CONTINUOUS)`` for every item in
        the second trainer's ``output_data``.
    """
    lr = 0.00088
    batch_size = 100
    epochs = 100

    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    module_features = (dataset.single_continuous_data_len, 30, 20, 16)

    # Build the trainer and run it as two separate steps, as the other call
    # sites do. Chaining ``Trainer(...)(epochs)`` only worked if ``__call__``
    # happened to return the trainer itself.
    first_stage = Trainer(module_features=module_features, learning_rate=lr,
                          batch_size=batch_size,
                          dataset=dataset, output_data_label=target_class,
                          output_data_size=target_num // 2)
    first_stage(epochs)

    temp = first_stage.output_data
    print(temp[0].attr_list)

    # Second stage: train again on what the first stage generated.
    dataset = MyDataSet(temp, target_class=target_class, encode=True)
    second_stage = Trainer(module_features=module_features, learning_rate=lr,
                           batch_size=batch_size,
                           dataset=dataset, output_data_label=target_class,
                           output_data_size=target_num)
    second_stage(epochs)

    return [data.to_list(DataType.CONTINUOUS)
            for data in second_stage.output_data]
Exemple #5
0
def gen_with_vae(target_class, target_num, data_name):
    """Train one VAE on ``target_class`` and return what it generated.

    Args:
        target_class: Class label to load from ``data_name`` and to generate.
        target_num: Output size requested from the trainer.
        data_name: Data source handed to ``MyDataSet``.

    Returns:
        The trainer's ``output_data`` after calling the trainer with 80.
    """
    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    layer_sizes = (dataset.single_continuous_data_len, 30, 20, 16)

    trainer = Trainer(
        dataset=dataset,
        module_features=layer_sizes,
        batch_size=64,
        learning_rate=0.000921,
        output_data_label=target_class,
        output_data_size=target_num,
    )
    trainer(80)
    return trainer.output_data
Exemple #6
0
    def __init__(self,
                 data_name,
                 target_class,
                 target_num,
                 module_features,
                 learning_rate,
                 batch_size,
                 log=False):
        """Load the target class and set up loader, network and optimizer.

        Args:
            data_name: Data source handed to ``MyDataSet``.
            target_class: Class label to load; stored on the instance.
            target_num: Stored on the instance and used in the log file name.
            module_features: Layer sizes that follow the input layer. Any
                iterable is accepted; the length of the first record is
                prepended as the input size before building ``NetVAE``.
            learning_rate: Learning rate handed to the Adam optimizer.
            batch_size: Batch size handed to the ``DataLoader``.
            log: When true, create ``KDD99_FAKE_<dataname>`` if needed and
                open a loss-log file inside it for writing.
        """
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.target_class = target_class
        self.target_num = target_num

        dataset = MyDataSet(data_name, target_class=target_class)
        for d in dataset.datalist:
            print(d.attrlist)
        self.dataset = dataset

        self.dataloader_train = DataLoader(dataset=dataset,
                                           batch_size=batch_size,
                                           shuffle=True,
                                           drop_last=False)

        # Unpacking instead of ``[n] + module_features`` keeps list input
        # unchanged and also accepts tuples, which list concatenation rejects.
        self.net = NetVAE(
            module_features=[len(dataset.datalist[0]), *module_features])
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=learning_rate)

        self.targetdata_list = []

        self.log = log
        if log:
            self.dir_path = 'KDD99_FAKE_{}'.format(dataset.dataname)
            self.file_path = '/from_{}_gen_{}_label={}_'.format(
                len(dataset), target_num, target_class)
            # exist_ok=True replaces the exists()-then-mkdir() pair, which
            # could fail if the directory appeared between the two calls.
            os.makedirs(self.dir_path, exist_ok=True)
            # The handle is kept on the instance and is not closed here.
            self.loss_log = open(
                self.dir_path + self.file_path + 'loss_log.txt', 'w')
Exemple #7
0
def do_classify(data_name='new_data.dat', binary_class=True, using_kdd99=False):
    """Run the classification experiments on one data file.

    The class with the largest entry in ``dataset.class_dict`` is the
    positive class; every other class is a negative class. Each experiment
    is run twice, first with ``expend=False`` and then with ``expend=True``.

    Args:
        data_name: Name of the data file handed to ``MyDataSet``.
        binary_class: When true, call ``binary_classify`` for the positive
            class against each negative class in turn; otherwise call
            ``multi_classify`` once per ``expend`` value with all negatives.
        using_kdd99: Forwarded unchanged to the classify helpers.

    Raises:
        ValueError: If the dataset reports no classes.
    """
    # TODO print
    print('------------------------------------')

    dataset = MyDataSet(data_name)
    class_dict = dataset.class_dict

    # TODO print
    print(data_name)
    print(class_dict)

    if not class_dict:
        raise ValueError('no classes found in {}'.format(data_name))

    # max() returns the first class among equal counts, like the strict
    # ``>`` scan it replaces. The negatives are derived without popping from
    # class_dict, so the dataset's own dict is left untouched.
    positive = max(class_dict, key=class_dict.get)
    pos_len = class_dict[positive]
    negatives = [name for name in class_dict if name != positive]
    negs_len = [class_dict[name] for name in negatives]

    # One value -> integer code dict per discrete attribute position. The
    # first record seeds every dict with code 0; each unseen value then gets
    # the next free code, which equals the current dict size.
    data_map = [{k: 0} for k in dataset.data_list_discrete[0]]
    for data in dataset.data_list_discrete:
        for attr, attr_map in zip(data, data_map):
            attr_map.setdefault(attr, len(attr_map))

    if binary_class:
        # Binary classification
        for negative, neg_len in zip(negatives, negs_len):
            print('{} vs {}'.format(positive, negative))
            for expend in (False, True):
                # Each run gets its own deep copy of the training data.
                binary_classify(
                    data_train_total=copy.deepcopy(dataset.data_list_total),
                    positive=positive, negative=negative,
                    pos_len=pos_len, neg_len=neg_len,
                    expend=expend, using_kdd99=using_kdd99,
                    data_name=data_name,
                    data_map=data_map, vae_only=False)
    else:
        # Multi-class classification
        for expend in (False, True):
            multi_classify(
                data_train_total=copy.deepcopy(dataset.data_list_total),
                positive=positive, negatives=negatives,
                pos_len=pos_len, negs_len=negs_len,
                expend=expend, using_kdd99=using_kdd99,
                data_name=data_name, data_map=data_map)
Exemple #8
0
 def __init__(self, data_name, target_class, data_map):
     """Load one class of ``data_name`` and integer-encode its discrete records.

     Args:
         data_name: Forwarded to ``MyDataSet``.
         target_class: Class label forwarded to ``MyDataSet``.
         data_map: One value -> integer code dict per attribute position.
             It is updated in place whenever an unseen value is met, so the
             caller's object receives the new codes as well.
     """
     self.__target_class = target_class
     self.__data_map = data_map

     dataset = MyDataSet(data_name, encode=False, target_class=target_class)
     self.__dataset = dataset

     encoded_records = []
     for record in dataset.data_list_discrete:
         codes = []
         for value, codebook in zip(record, data_map):
             if value not in codebook:
                 # An unseen value gets the most recently added code plus one.
                 latest_code = list(codebook.values())[-1]
                 codebook[value] = latest_code + 1
             codes.append(codebook[value])
         encoded_records.append(codes)
     self.__data_num_list = encoded_records
def do_classify(data_name):
    """Run the multi-class experiment on one data file.

    The class with the largest entry in ``dataset.class_dict`` is the
    positive class; all remaining classes are negatives, ordered by
    ascending value (the values are assumed to be per-class sample counts --
    they are passed on as ``positive_len`` / ``negative_lens``).
    ``multi_classify`` is called twice, with ``expend=False`` and then
    ``expend=True``.

    Args:
        data_name: Name of the data file handed to ``MyDataSet``.
    """
    # TODO print
    print('------------------------------------')

    dataset = MyDataSet(data_name)
    class_dict = dataset.class_dict

    # TODO print
    print(data_name)
    print(class_dict)

    def count_of(item):
        return item[1]

    # Work on (class, count) pairs directly. Inverting the dict into
    # count -> class silently merged classes that share a count, so tied
    # negatives vanished and a class tied with the positive one was dropped.
    class_counts = list(class_dict.items())

    # Scanning in reverse lets the later entry win a tie, which is what the
    # former count -> class inversion did.
    positive, positive_len = max(reversed(class_counts), key=count_of)

    # Every other class is a negative; sorted() is stable, so classes with
    # equal counts keep their class_dict order.
    remaining = sorted(
        (item for item in class_counts if item[0] != positive),
        key=count_of)
    negatives = [name for name, _ in remaining]
    negative_lens = [count for _, count in remaining]

    # One value -> integer code dict per discrete attribute position. The
    # first record seeds every dict with code 0; each unseen value then gets
    # the next free code, which equals the current dict size.
    discrete_num_map = [{k: 0} for k in dataset.data_list_discrete[0]]
    for data in dataset.data_list_discrete:
        for attr, attr_map in zip(data, discrete_num_map):
            attr_map.setdefault(attr, len(attr_map))

    # Multi-class classification
    for expend in (False, True):
        # Each run gets its own deep copy of the training data.
        multi_classify(data_train=copy.deepcopy(dataset.data_list_total),
                       positive=positive,
                       negatives=negatives,
                       positive_len=positive_len,
                       negative_lens=negative_lens,
                       expend=expend,
                       data_name=data_name,
                       discrete_num_map=discrete_num_map)
Exemple #10
0
    if plot:
        plt.plot(x, y, color=color)


def normfun(x, mu, sigma):
    """Evaluate the normal (Gaussian) probability density at ``x``.

    Args:
        x: Point(s) at which to evaluate the density.
        mu: Mean of the distribution.
        sigma: Standard deviation of the distribution.

    Returns:
        ``exp(-(x - mu)^2 / (2 sigma^2)) / (sigma * sqrt(2 pi))``.
    """
    exponent = -((x - mu)**2) / (2 * sigma**2)
    normaliser = sigma * np.sqrt(2 * np.pi)
    return np.exp(exponent) / normaliser


if __name__ == '__main__':
    # Script entry point: load one data file, pick two classes by the values
    # in dataset.dataclass_dict, and load the samples of one of them again.
    dataname = 'ecoli4.dat'
    print(dataname)
    # dataname = 'kdd99_new_multi.dat'

    dataset = MyDataSet(dataname, encode=True)

    # Invert dataclass_dict so a class can be looked up by its value
    # (presumably class label -> sample count; verify against MyDataSet).
    # NOTE(review): classes that share a value collapse into one entry here.
    reversed_class_dict = dict(
        zip(dataset.dataclass_dict.values(), dataset.dataclass_dict.keys()))
    len_list = list(reversed_class_dict.keys())
    len_list.sort()

    # After the ascending sort, index 0 is the smallest value and index 1 the
    # second smallest.
    # NOTE(review): this raises IndexError when only one distinct value
    # exists, and ignores larger values when there are more than two.
    positive_len, negative_len = len_list[1], len_list[0]
    positive_class, negative_class = reversed_class_dict[
        positive_len], reversed_class_dict[negative_len]
    # negative_len is re-bound to the gap between the two values
    # (presumably the number of samples still missing -- TODO confirm).
    negative_len = positive_len - negative_len

    # Load only the records of the class with the smaller value.
    dataset_negative = MyDataSet(dataname,
                                 target_class=negative_class,
                                 encode=True)
Exemple #11
0
    def loss_function(input_tensor, target_tensor, mean,
                      log_var) -> (Tensor, Tensor):
        """Return the two loss terms as ``(reconstruction_loss, kl_divergence)``.

        Args:
            input_tensor: First tensor handed to the cosine similarity.
            target_tensor: Second tensor handed to the cosine similarity.
            mean: Mean term of the KL expression.
            log_var: Log-variance term of the KL expression.

        Returns:
            A pair: the summed cosine similarity of the two tensors, and
            ``-0.5 * sum(1 + log_var - exp(log_var) - mean^2)``.
        """
        # NOTE(review): cosine similarity grows as the tensors align, so
        # confirm the sign convention the caller applies to this term.
        cosine = torch.nn.CosineSimilarity()
        reconstruction_loss = cosine(input_tensor, target_tensor).sum()

        kl_terms = 1 + log_var - log_var.exp() - mean**2
        kl_divergence = -0.5 * kl_terms.sum()

        return reconstruction_loss, kl_divergence


if __name__ == '__main__':
    # Script entry point: train a VAE on the 'land' samples, print what it
    # generated, then wrap the generated records in a new dataset.
    target_class = 'land'
    data_name = 'kdd99_' + target_class + '.kdd99'

    dataset = MyDataSet(data_name, target_class=target_class, encode=True)
    trainer = Trainer(
        dataset=dataset,
        module_features=(dataset.single_continuous_data_len, 30, 20, 16),
        batch_size=100,
        learning_rate=0.000918,
        output_data_label=target_class,
        output_data_size=40,
    )
    trainer(50)

    # TODO print
    generated = trainer.output_data
    print(generated)

    dataset = MyDataSet(generated, target_class=target_class, encode=True)