Example #1
def prepare_madelon(nb_devices: int,
                    data_path: str,
                    pickle_path: str,
                    iid: bool = True,
                    double_check: bool = False):

    raw_data = pd.read_csv('{0}/dataset/madelon/madelon_train.data'.format(
        get_path_to_datasets()),
                           sep=" ",
                           header=None)
    # Drop the last column: the trailing space at the end of each line in
    # madelon_train.data parses as an extra all-NaN column.
    raw_data.drop(raw_data.columns[len(raw_data.columns) - 1],
                  axis=1,
                  inplace=True)
    scaled_X = scale(np.array(raw_data, dtype=np.float64))
    scaled_data = pd.DataFrame(data=scaled_X)
    dim = len(scaled_data.columns)

    Y_data = pd.read_csv('{0}/dataset/madelon/madelon_train.labels'.format(
        get_path_to_datasets()),
                         header=None)

    scaled_data["target"] = Y_data.values

    if iid:
        X_tensor = torch.tensor(scaled_X, dtype=torch.float64)
        Y_tensor = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_tensor, Y_tensor, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "target",
                                      data_path + "/madelon", pickle_path,
                                      nb_devices, double_check)
    return X, Y, dim + 1  # +1 for the bias column.
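All of the examples on this page assume the usual imports (pandas as pd, numpy as np, torch, logging, sklearn.preprocessing.scale and LabelEncoder, sklearn.datasets.load_svmlight_file) plus the project-local helpers get_path_to_datasets, prepare_dataset_by_device and prepare_noniid_dataset, none of which are shown. The IID branch splits the feature and label tensors across devices with prepare_dataset_by_device; a minimal sketch of what such a helper could look like, assuming it simply shuffles the rows and cuts them into nb_devices roughly equal shards (the actual implementation is not part of these examples):

import torch
from typing import List, Tuple


def prepare_dataset_by_device(X: torch.Tensor, Y: torch.Tensor,
                              nb_devices: int) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    # Hypothetical reconstruction: only the call sites appear above, so the
    # exact splitting logic is an assumption.
    permutation = torch.randperm(X.shape[0])  # shuffle rows so each shard is IID
    X, Y = X[permutation], Y[permutation]
    # torch.chunk yields nb_devices pieces whose sizes differ by at most one row.
    return list(torch.chunk(X, nb_devices)), list(torch.chunk(Y, nb_devices))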
Example #2
def prepare_w8a(nb_devices: int,
                data_path: str,
                pickle_path: str,
                iid: bool = True,
                double_check: bool = False,
                test: bool = False):

    # w8a is distributed in svmlight/libsvm format.
    if not test:
        raw_X, raw_Y = load_svmlight_file("{0}/dataset/w8a/w8a".format(
            get_path_to_datasets()))
        raw_X = raw_X.todense()
    else:
        raw_X, raw_Y = load_svmlight_file("{0}/dataset/w8a/w8a.t".format(
            get_path_to_datasets()))
        raw_X = raw_X.todense()
        # The test file parses with one feature column fewer than the train
        # file, so a zero column is appended to align the dimensions.
        raw_X = np.c_[raw_X, np.zeros(len(raw_Y))]

    scaled_X = scale(np.array(raw_X, dtype=np.float64))
    scaled_data = pd.DataFrame(data=scaled_X)
    scaled_data["target"] = raw_Y
    dim = len(scaled_data.columns) - 1

    Y_data = scaled_data.loc[:, scaled_data.columns == "target"]

    if iid:
        X_tensor = torch.tensor(scaled_X, dtype=torch.float64)
        Y_tensor = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_tensor, Y_tensor, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "target",
                                      data_path + "/w8a", pickle_path,
                                      nb_devices, double_check)
    return X, Y, dim + 1  # +1 for the bias column.
Example #3
def prepare_superconduct(nb_devices: int,
                         data_path: str,
                         pickle_path: str,
                         iid: bool = True,
                         double_check: bool = False):
    raw_data = pd.read_csv('{0}/dataset/superconduct/train.csv'.format(
        get_path_to_datasets()),
                           sep=",")
    if raw_data.isnull().values.any():
        logging.warning("There are missing values.")
    else:
        logging.debug("No missing values. Great!")
    logging.debug("Scaling data.")
    scaled_data = scale(raw_data)

    scaled_data = pd.DataFrame(data=scaled_data, columns=raw_data.columns)
    X_data = scaled_data.loc[:, scaled_data.columns != "critical_temp"]
    Y_data = scaled_data.loc[:, scaled_data.columns == "critical_temp"]
    dim = len(X_data.columns)
    logging.debug("There is " + str(dim) + " dimensions.")

    logging.debug("Head of the dataset:")
    logging.debug(raw_data.head())

    if iid:
        X_tensor = torch.tensor(X_data.to_numpy(), dtype=torch.float64)
        Y_tensor = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_tensor, Y_tensor, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "critical_temp",
                                      data_path + "/superconduct", pickle_path,
                                      nb_devices, double_check)
    return X, Y, dim + 1  # +1 for the bias column.
Example #4
def prepare_phishing(nb_devices: int,
                     data_path: str,
                     pickle_path: str,
                     iid: bool = True,
                     double_check: bool = False):

    raw_X, raw_Y = load_svmlight_file(
        "{0}/dataset/phishing/phishing.txt".format(get_path_to_datasets()))

    # The source labels are 0/1; remap 0 to -1 to obtain +/-1 labels.
    raw_Y[raw_Y == 0] = -1

    scaled_X = scale(np.array(raw_X.todense(), dtype=np.float64))
    scaled_data = pd.DataFrame(data=scaled_X)
    scaled_data["target"] = raw_Y
    dim = len(scaled_data.columns) - 1

    Y_data = scaled_data.loc[:, scaled_data.columns == "target"]

    if iid:
        X_tensor = torch.tensor(scaled_X, dtype=torch.float64)
        Y_tensor = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_tensor, Y_tensor, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "target",
                                      data_path + "/phishing", pickle_path,
                                      nb_devices, double_check)
    return X, Y, dim + 1  # +1 for the bias column.
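The non-IID branch of each example delegates to prepare_noniid_dataset, which is also not shown. Judging from the docstring of load_data in Example #8 ("True if the dataset must not be split by target value"), non-IID here means each device receives a label-skewed shard. A minimal sketch under that assumption; the pickle_path caching and the double_check verification are omitted because their behavior is unknown:

import torch
import pandas as pd


def prepare_noniid_dataset(data: pd.DataFrame, target: str, data_path: str,
                           pickle_path: str, nb_devices: int,
                           double_check: bool = False):
    # Hypothetical sketch: sort by the target column so that each device
    # receives a label-skewed (non-IID) shard of the data.
    data = data.sort_values(by=target)
    X = torch.tensor(data.drop(columns=[target]).to_numpy(), dtype=torch.float64)
    Y = torch.tensor(data[target].to_numpy(), dtype=torch.float64)
    return list(torch.chunk(X, nb_devices)), list(torch.chunk(Y, nb_devices))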
Example #5
def prepare_abalone(nb_devices: int,
                    data_path: str,
                    pickle_path: str,
                    iid: bool = True,
                    double_check: bool = False):
    raw_data = pd.read_csv('{0}/dataset/abalone/abalone.csv'.format(
        get_path_to_datasets()),
                           sep=",",
                           header=None)

    raw_data = raw_data.rename(columns={
        0: "gender",
        1: "Length",
        2: "Diameter",
        3: "Height",
        8: "rings"
    })
    labelencoder = LabelEncoder()
    raw_data["gender"] = labelencoder.fit_transform(raw_data["gender"])

    Y_data = raw_data.loc[:, raw_data.columns == "rings"]

    scaled_data = scale(raw_data.loc[:, raw_data.columns != "rings"])
    scaled_X = pd.DataFrame(
        data=scaled_data,
        columns=raw_data.loc[:, raw_data.columns != "rings"].columns)

    # Merge the features and the target back into one dataframe:
    scaled_data = pd.concat([scaled_X, Y_data], axis=1, sort=False)
    dim = len(scaled_X.columns)

    if iid:
        X_merged = torch.tensor(scaled_X.to_numpy(), dtype=torch.float64)
        Y_merged = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_merged, Y_merged, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "rings",
                                      data_path + "/abalone", pickle_path,
                                      nb_devices, double_check)
    return X, Y, dim + 1  # +1 for the bias column.
Example #6
def prepare_mushroom(nb_devices: int,
                     data_path: str,
                     pickle_path: str,
                     iid: bool = True,
                     double_check: bool = False):
    raw_data = pd.read_csv('{0}/dataset/mushroom/mushrooms.csv'.format(
        get_path_to_datasets()))

    # The data is categorical, so each column is encoded as ordinal integers with LabelEncoder.
    labelencoder = LabelEncoder()
    for column in raw_data.columns:
        raw_data[column] = labelencoder.fit_transform(raw_data[column])

    # The column "veil-type" is constant, so it contributes nothing and is removed.
    raw_data = raw_data.drop(["veil-type"], axis=1)
    # Remap the 0/1 class labels to -1/+1.
    raw_data = raw_data.replace({'class': {0: -1}})

    Y_data = raw_data.loc[:, raw_data.columns == "class"]

    scaled_data = scale(raw_data.loc[:, raw_data.columns != "class"])
    scaled_X = pd.DataFrame(
        data=scaled_data,
        columns=raw_data.loc[:, raw_data.columns != "class"].columns)

    # Merge the features and the target back into one dataframe:
    scaled_data = pd.concat([scaled_X, Y_data], axis=1, sort=False)
    dim = len(scaled_X.columns)

    if iid:
        X_merged = torch.tensor(scaled_X.to_numpy(), dtype=torch.float64)
        Y_merged = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_merged, Y_merged, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "class",
                                      data_path + "/mushroom", pickle_path,
                                      nb_devices, double_check)
    return X, Y, dim + 1  # +1 for the bias column.
Example #7
def prepare_quantum(nb_devices: int,
                    data_path: str,
                    pickle_path: str,
                    iid: bool = True,
                    double_check: bool = False):

    raw_data = pd.read_csv('{0}/dataset/quantum/phy_train.csv'.format(
        get_path_to_datasets()),
                           sep="\t",
                           header=None)

    # Values 999 and 9999 are sentinels for missing data; find the affected columns.
    columns_with_missing_values = []
    for col in range(1, len(raw_data.columns)):
        if (not raw_data[raw_data[col] == 999].empty) or (
                not raw_data[raw_data[col] == 9999].empty):
            columns_with_missing_values.append(col)
    logging.debug("The following columns have missing values: {0}".format(
        columns_with_missing_values))
    raw_data.drop(raw_data.columns[columns_with_missing_values],
                  axis=1,
                  inplace=True)
    logging.debug("The columns with missing values have been removed.")
    raw_data = raw_data.rename(columns={0: "ID", 1: "state", 80: "nothing"})
    raw_data = raw_data.drop(['ID', 'nothing'], axis=1)

    # Looking for constant columns (with near-zero std).
    small_std = []
    std_data = raw_data.std()
    for i in range(len(raw_data.columns)):
        if std_data.iloc[i] < 1e-5:
            small_std.append(i)
    logging.debug("These columns are constant: {0}".format(small_std))
    logging.debug(raw_data.iloc[:, small_std].describe())

    # Removing the constant columns.
    raw_data = raw_data.loc[:, (raw_data.std() > 1e-6)]
    dim = len(raw_data.columns) - 1  # The dataset still contains the label.
    logging.debug("There are now {0} dimensions.".format(dim))

    # Remap the 0/1 labels in "state" to -1/+1.
    raw_data = raw_data.replace({'state': {0: -1}})

    logging.debug("Head of the dataset (columns has not been re-indexed).")
    logging.debug(raw_data.head())

    logging.debug("Labels repartition:")
    logging.debug(raw_data['state'].value_counts())

    # We do not scale the +/-1 labels.
    Y_data = raw_data.loc[:, raw_data.columns == "state"]

    logging.debug("Scaling data.")
    scaled_data = scale(raw_data.loc[:, raw_data.columns != "state"])

    scaled_X = pd.DataFrame(
        data=scaled_data,
        columns=raw_data.loc[:, raw_data.columns != "state"].columns)

    # Merge the features and the labels back into one dataframe:
    scaled_data = pd.concat([scaled_X, Y_data], axis=1, sort=False)

    if iid:
        # Transforming into torch tensors (float64).
        X_tensor = torch.tensor(scaled_X.to_numpy(), dtype=torch.float64)
        Y_tensor = torch.tensor(Y_data.values, dtype=torch.float64)
        X, Y = prepare_dataset_by_device(X_tensor, Y_tensor, nb_devices)
    else:
        X, Y = prepare_noniid_dataset(scaled_data, "state",
                                      data_path + "/quantum", pickle_path,
                                      nb_devices, double_check)

    return X, Y, dim + 1  # +1 for the bias column.
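A hypothetical call site for these prepare_* functions; nb_devices=10 and the two paths below are placeholder values, not names taken from the project:

# Hypothetical usage sketch; the paths are placeholders.
X, Y, dim = prepare_quantum(nb_devices=10,
                            data_path="./data",
                            pickle_path="./pickle",
                            iid=True)
assert len(X) == len(Y) == 10  # one feature/label shard per device
print("Input dimension (including the bias column):", dim)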
Example #8
def load_data(dataset: str, iid: bool):
    """Loads a dataset.

    :param dataset: Name of the dataset
    :param iid: True if the dataset must not be split by target value
    :return: Train dataset, test dataset
    """
    path_to_dataset = '{0}/dataset/'.format(get_path_to_datasets())
    if dataset == "fake":

        transform = transforms.ToTensor()

        train_data = datasets.FakeData(size=200, transform=transform)

        test_data = datasets.FakeData(size=200, transform=transform)

    elif dataset == 'cifar10':

        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        transform_train = transforms.Compose([
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, 4),
            transforms.ToTensor(),
            normalize,
        ])

        transform_test = transforms.Compose([
            transforms.ToTensor(),
            normalize,
        ])

        train_data = datasets.CIFAR10(root=path_to_dataset, train=True, download=True, transform=transform_train)

        test_data = datasets.CIFAR10(root=path_to_dataset, train=False, download=True, transform=transform_test)

    elif dataset == 'mnist':

        # Normalization see : https://stackoverflow.com/a/67233938
        transform = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
        train_data = datasets.MNIST(root=path_to_dataset, train=True, download=False, transform=transform)

        test_data = datasets.MNIST(root=path_to_dataset, train=False, download=False, transform=transform)

    elif dataset == "fashion_mnist":

        train_transforms = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])
        val_transforms = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ])

        # Download and load the training data
        train_data = datasets.FashionMNIST(path_to_dataset, download=True, train=True, transform=train_transforms)

        # Download and load the test data
        test_data = datasets.FashionMNIST(path_to_dataset, download=True, train=False, transform=val_transforms)

    elif dataset == "femnist":

        transform = transforms.Compose([transforms.ToTensor()])

        train_data = FEMNISTDataset(path_to_dataset, download=True, train=True, transform=transform)

        test_data = FEMNISTDataset(path_to_dataset, download=True, train=False, transform=transform)

    elif dataset == "a9a":

        train_data = A9ADataset(train=True, iid=iid)

        test_data = A9ADataset(train=False, iid=iid)

    elif dataset == "mushroom":

        train_data = MushroomDataset(train=True, iid=iid)

        test_data = MushroomDataset(train=False, iid=iid)

    elif dataset == "phishing":

        train_data = PhishingDataset(train=True, iid=iid)

        test_data = PhishingDataset(train=False, iid=iid)

    elif dataset == "quantum":
        train_data = QuantumDataset(train=True, iid=iid)

        test_data = QuantumDataset(train=False, iid=iid)

    return train_data, test_data
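A hypothetical usage sketch for load_data, wrapping the returned datasets in torch.utils.data.DataLoader; batch_size=128 is an arbitrary choice, not a value from the project:

from torch.utils.data import DataLoader

train_data, test_data = load_data("mnist", iid=True)
train_loader = DataLoader(train_data, batch_size=128, shuffle=True)
test_loader = DataLoader(test_data, batch_size=128, shuffle=False)

for batch_X, batch_Y in train_loader:
    ...  # one training step per mini-batch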