Esempio n. 1
0
 def getDatasets(self, percUsers, labels, size=None):
     """Load COVIDx and build the client training datasets and the test dataset.

     Args:
         percUsers: forwarded unchanged to
             ``_splitTrainDataIntoClientDatasets``.
         labels: forwarded to ``_filterDataByLabel`` to filter both frames.
         size: optional ``(trainSize, testSize)`` iterable forwarded to the
             COVIDx loader. When omitted, ``None`` is forwarded for both
             sizes (assumed to mean "no row limit" -- confirm against
             ``__readDataframe``).

     Returns:
         Tuple ``(clientDatasets, testDataset)``.
     """
     logPrint("Loading COVIDx...")
     self._setRandomSeeds()
     # The declared default ``size=None`` cannot be star-unpacked; the
     # original raised TypeError whenever the caller omitted ``size``.
     sizeArgs = (None, None) if size is None else size
     data = self.__loadCOVIDxData(*sizeArgs)
     trainDataframe, testDataframe = self._filterDataByLabel(labels, *data)
     clientDatasets = self._splitTrainDataIntoClientDatasets(
         percUsers, trainDataframe, self.COVIDxDataset)
     # The test split is flagged via isTestDataset=True when wrapped.
     testDataset = self.COVIDxDataset(testDataframe, isTestDataset=True)
     return clientDatasets, testDataset
Esempio n. 2
0
 def test(self, testDataset):
     """Evaluate ``self.model`` on ``testDataset`` and return the error rate.

     Samples are served in order through a ``DataLoader`` with its default
     batch size, predictions are made with gradients disabled, and the
     error rate is one minus the confusion-matrix diagonal sum divided by
     ``len(testDataset)``. The rate is also logged as a percentage.
     """
     loader = DataLoader(testDataset, shuffle=False)
     with torch.no_grad():
         pairs = [(self.predict(self.model, sample), target)
                  for sample, target in loader]
     predicted, expected = zip(*pairs)

     predicted = torch.tensor(predicted, dtype=torch.long)
     expected = torch.tensor(expected, dtype=torch.long)

     # The diagonal of the confusion matrix holds the correct predictions.
     confusion = confusion_matrix(expected, predicted)
     correct = confusion.diagonal().sum()
     errors = 1 - 1.0 * correct / len(testDataset)
     logPrint("Error Rate: ", round(100.0 * errors, 3), "%")
     return errors
Esempio n. 3
0
    def __loadCOVIDxData(self, trainSize, testSize):
        """Return the COVIDx train and test dataframes, assembling them if needed.

        If the dataset check reports the data as missing, the dataset is
        rebuilt from the downloaded sources when ``self.assembleDatasets``
        is set; otherwise the process terminates.

        Args:
            trainSize: forwarded to ``__readDataframe`` for the train CSV.
            testSize: forwarded to ``__readDataframe`` for the test CSV.

        Returns:
            Tuple ``(trainDataframe, testDataframe)``.

        Raises:
            SystemExit: with a non-zero status when the dataset is missing
                and assembling is disabled.
        """
        if self.__datasetNotFound():
            logPrint("Can't find train|test split .txt files or "
                     "/train, /test files not populated accordingly.")
            if not self.assembleDatasets:
                # This is a fatal condition: exit non-zero so that shells and
                # job schedulers do not record the run as successful.
                sys.exit(1)

            logPrint(
                "Proceeding to assemble dataset from downloaded resources.")
            self.__joinDatasets()

        trainDataframe = self.__readDataframe(self.trainCSV, trainSize)
        testDataframe = self.__readDataframe(self.testCSV, testSize)

        return trainDataframe, testDataframe
Esempio n. 4
0
    def trainAndTest(self, testDataset):
        """Run every round with median-based merging and record the test error.

        Each round shares the global model with the clients and trains them,
        replaces ``self.model`` with the result of ``__medianModels`` over
        the retrieved client models, and evaluates on ``testDataset``.

        Returns:
            A tensor of length ``self.rounds`` holding each round's error.
        """
        errorPerRound = torch.zeros(self.rounds)

        for roundNo in range(self.rounds):
            logPrint("Round... ", roundNo)

            self._shareModelAndTrainOnClients()
            clientModels = self._retrieveClientModelsDict()

            # The merged model becomes the new global model.
            self.model = self.__medianModels(clientModels)

            errorPerRound[roundNo] = self.test(testDataset)

        return errorPerRound
Esempio n. 5
0
    def trainAndTest(self, testDataset):
        """Run every round with weighted merging and record the test error.

        Each round shares the global model with the clients and trains them,
        merges every client model into ``self.model`` with weight
        ``client.p``, and evaluates on ``testDataset``.

        Returns:
            A tensor of length ``self.rounds`` holding each round's error.
        """
        errorPerRound = torch.zeros(self.rounds)
        for roundNo in range(self.rounds):
            logPrint("Round... ", roundNo)
            self._shareModelAndTrainOnClients()
            clientModels = self._retrieveClientModelsDict()

            for position, client in enumerate(self.clients):
                # The last argument is 0.0 for the first client and 1.0 for
                # the rest (presumably the factor applied to the model that
                # is being merged into -- confirm against _mergeModels).
                keepFactor = 1.0 if position else 0.0
                self._mergeModels(clientModels[client].to(self.device),
                                  self.model.to(self.device), client.p,
                                  keepFactor)

            errorPerRound[roundNo] = self.test(testDataset)

        return errorPerRound
Esempio n. 6
0
    def trainAndTest(self, testDataset):
        """Run every round with distance-score selection and record the error.

        Each round, every client model is scored by the cumulative sum of
        its sorted distances to the other client models (read at index
        ``th``). The ``mk - 1`` lowest-scoring models are averaged into
        ``self.model``, which is then evaluated on ``testDataset``.

        Scores are indexed by ``client.id - 1``, so client ids are relied on
        to be ``1..len(self.clients)``.

        Returns:
            A tensor of length ``self.rounds`` holding each round's error.
        """
        userNo = len(self.clients)
        # Number of Byzantine workers to be tolerated
        f = int((userNo - 3) / 2)
        th = userNo - f - 2
        mk = userNo - f

        roundsError = torch.zeros(self.rounds)

        for r in range(self.rounds):
            logPrint("Round... ", r)

            self._shareModelAndTrainOnClients()

            # Compute distances for all users
            scores = torch.zeros(userNo)
            models = self._retrieveClientModelsDict()
            for client in self.clients:
                # Only this client's row of distances is ever used, so a
                # vector replaces the former userNo x userNo matrix. The
                # client's own entry stays 0 and sorts to the front.
                distances = torch.zeros(userNo)
                for client2 in self.clients:
                    if client.id != client2.id:
                        distances[client2.id - 1] = self.__computeModelDistance(
                            models[client].to(self.device),
                            models[client2].to(self.device))
                scores[client.id - 1] = distances.sort()[0].cumsum(0)[th]

            _, idx = scores.sort()
            # Convert the 0-based positions back to 1-based client ids.
            selected_users = set((idx[:mk - 1] + 1).tolist())

            # Normalise by the number of models actually merged. The previous
            # fixed weight 1 / mk was applied to only mk - 1 models, so the
            # weights did not sum to 1 and the aggregate was scaled down.
            weight = 1 / len(selected_users) if selected_users else 0.0

            comb = 0.0
            for client in self.clients:
                if client.id in selected_users:
                    self._mergeModels(models[client].to(self.device),
                                      self.model.to(self.device), weight, comb)
                    comb = 1.0

            roundsError[r] = self.test(testDataset)

        return roundsError
Esempio n. 7
0
    def getDatasets(self, percUsers, labels, size=None):
        """Load the Heart Disease data and build client and test datasets.

        ``size`` is accepted but not used by this loader. When
        ``self.requireDatasetAnonymization`` is set, the client datasets are
        anonymized first and the resulting mappings are then applied to the
        test dataset.

        Returns:
            Tuple ``(clientDatasets, testDataset)``.
        """
        logPrint("Loading Heart Disease data...")
        self._setRandomSeeds()

        trainFrame, testFrame, columnNames = self.__loadHeartDiseaseData()
        trainFrame, testFrame = self._filterDataByLabel(
            labels, trainFrame, testFrame)

        perClient = self._splitTrainDataIntoClientDatasets(
            percUsers, trainFrame, self.HeartDiseaseDataset)
        heldOut = self.HeartDiseaseDataset(testFrame)

        # Nothing more to do unless anonymization was requested.
        if not self.requireDatasetAnonymization:
            return perClient, heldOut

        perClient, mappings, generalized = self._anonymizeClientDatasets(
            perClient, columnNames, 4, self.quasiIds, self.__setHierarchies)
        heldOut = self._anonymizeTestDataset(heldOut, mappings, columnNames,
                                             generalized)

        return perClient, heldOut
Esempio n. 8
0
    def __joinDatasets(self):
        """Assemble the COVIDx train/test image folders and split files.

        Reads the covid-chestxray-dataset metadata (PA views only) and the
        RSNA Kaggle CSVs found under ``self.dataPath``. Images are copied or
        converted into ``<dataPath>/train`` and ``<dataPath>/test``, and
        ``train_split_v2.txt`` / ``test_split_v2.txt`` are written with one
        ``patientid filename label`` line per sample.

        The Figure1 source is listed in ``dataSources`` but is not read here.

        Raises:
            SystemExit: with a non-zero status when a required source
                directory is missing or empty.
        """
        dataSources = [
            '/covid-chestxray-dataset', '/rsna-kaggle-dataset',
            '/Figure1-covid-chestxray-dataset'
        ]
        covidDir = self.dataPath + dataSources[0]
        # Path to https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
        kaggle_dataPath = self.dataPath + dataSources[1]

        # A missing directory is reported like an empty one; previously
        # os.listdir raised before the help message could be shown.
        if not os.path.isdir(covidDir) or not os.listdir(covidDir):
            logPrint(
                "You need to clone https://github.com/ieee8023/covid-chestxray-dataset to {}."
                "".format(covidDir))
            # Fatal: exit non-zero, and do not rely on the site-provided
            # ``exit`` helper, which is meant for interactive use.
            raise SystemExit(1)
        if not os.path.isdir(kaggle_dataPath) or not os.listdir(
                kaggle_dataPath):
            logPrint(
                "You need to unzip (https://www.kaggle.com/c/rsna-pneumonia-detection-challenge) dataset to {}."
                "".format(kaggle_dataPath))
            raise SystemExit(1)

        COPY_FILE = True
        if COPY_FILE:
            if not os.path.exists(self.dataPath + '/train'):
                os.makedirs(self.dataPath + '/train')
            if not os.path.exists(self.dataPath + '/test'):
                os.makedirs(self.dataPath + '/test')

        # path to covid-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
        imgPath = covidDir + '/images'
        csvPath = covidDir + '/metadata.csv'

        kaggle_csvname = 'stage_2_detailed_class_info.csv'  # get all the normal from here
        kaggle_csvname2 = 'stage_2_train_labels.csv'  # get all the 1s from here since 1 indicate pneumonia
        kaggle_imgPath = 'stage_2_train_images'

        # parameters for COVIDx dataset
        train = []
        test = []
        test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
        train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

        # Maps a source "finding" value to one of the three COVIDx labels.
        mapping = {
            'COVID-19': 'COVID-19',
            'SARS': 'pneumonia',
            'MERS': 'pneumonia',
            'Streptococcus': 'pneumonia',
            'Normal': 'normal',
            'Lung Opacity': 'pneumonia',
            '1': 'pneumonia',
        }

        # adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision./datasets.py#L814
        csv = pd.read_csv(csvPath, nrows=None)
        csv = csv[csv["view"] == "PA"]  # Keep only the PA view

        # get non-COVID19 viral, bacteria, and COVID-19 infections from covid-chestxray-dataset
        # stored as patient id, image filename and label
        filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
        count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
        print(csv.keys())
        for _, row in csv.iterrows():
            f = row['finding']
            if f in mapping:
                count[mapping[f]] += 1
                entry = [int(row['patientid']), row['filename'], mapping[f]]
                filename_label[mapping[f]].append(entry)

        print('Data distribution from covid-chestxray-dataset:')
        print(count)

        # add covid-chestxray-dataset into COVIDx dataset
        # since covid-chestxray-dataset doesn't have test dataset
        # split into train/test by patientid
        # for COVIDx:
        # patient 8 is used as non-COVID19 viral test
        # patient 31 is used as bacterial test
        # patients 19, 20, 36, 42, 86 are used as COVID-19 viral test

        for key in filename_label:
            # np.array turns the mixed int/str entries into strings, which is
            # why the test patient ids below are written as strings.
            arr = np.array(filename_label[key])
            if arr.size == 0:
                continue
            if key == 'pneumonia':
                test_patients = ['8', '31']
            elif key == 'COVID-19':
                test_patients = ['19', '20', '36', '42', '86']
            else:
                test_patients = []
            print('Key: ', key)
            print('Test patients: ', test_patients)
            # go through all the patients
            for patient in arr:
                if patient[0] in test_patients:
                    if COPY_FILE:
                        copyfile(
                            os.path.join(imgPath, patient[1]),
                            os.path.join(self.dataPath, 'test', patient[1]))
                        test.append(patient)
                        test_count[patient[2]] += 1
                    else:
                        print("WARNING: passing copy file.")
                        break
                else:
                    if COPY_FILE:
                        copyfile(
                            os.path.join(imgPath, patient[1]),
                            os.path.join(self.dataPath, 'train', patient[1]))
                        train.append(patient)
                        train_count[patient[2]] += 1

                    else:
                        print("WARNING: passing copy file.")
                        break

        print('test count: ', test_count)
        print('train count: ', train_count)

        # add normal and rest of pneumonia cases from https://www.kaggle.com/c/rsna-pneumonia-detection-challenge

        print(kaggle_dataPath)
        csv_normal = pd.read_csv(os.path.join(kaggle_dataPath, kaggle_csvname),
                                 nrows=None)
        csv_pneu = pd.read_csv(os.path.join(kaggle_dataPath, kaggle_csvname2),
                               nrows=None)
        patients = {'normal': [], 'pneumonia': []}

        for _, row in csv_normal.iterrows():
            if row['class'] == 'Normal':
                patients['normal'].append(row['patientId'])

        for _, row in csv_pneu.iterrows():
            if int(row['Target']) == 1:
                patients['pneumonia'].append(row['patientId'])

        for key in patients:
            arr = np.array(patients[key])
            if arr.size == 0:
                continue
            # The test patients for each class are loaded from a .npy file
            # under <dataPath>/COVID-Net rather than sampled here.
            test_patients = np.load(
                self.dataPath + '/COVID-Net/rsna_test_patients_{}.npy'
                ''.format(key))
            for patient in arr:
                # The Kaggle images are DICOM files; they are written out
                # again as PNG.
                ds = dicom.dcmread(
                    os.path.join(kaggle_dataPath, kaggle_imgPath,
                                 patient + '.dcm'))
                pixel_array_numpy = ds.pixel_array
                imgname = patient + '.png'
                if patient in test_patients:
                    if COPY_FILE:
                        cv2.imwrite(
                            os.path.join(self.dataPath, 'test', imgname),
                            pixel_array_numpy)
                        test.append([patient, imgname, key])
                        test_count[key] += 1
                    else:
                        print("WARNING: passing copy file.")
                        break
                else:
                    if COPY_FILE:
                        cv2.imwrite(
                            os.path.join(self.dataPath, 'train', imgname),
                            pixel_array_numpy)
                        train.append([patient, imgname, key])
                        train_count[key] += 1
                    else:
                        print("WARNING: passing copy file.")
                        break
        print('test count: ', test_count)
        print('train count: ', train_count)

        # final stats
        print('Final stats')
        print('Train count: ', train_count)
        print('Test count: ', test_count)
        print('Total length of train: ', len(train))
        print('Total length of test: ', len(test))

        # export to train and test csv
        # format as patientid, filename, label - separated by a space
        # ``with`` closes each file even if a write fails part-way through.
        with open(self.dataPath + "/train_split_v2.txt", "w") as train_file:
            for sample in train:
                info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
                train_file.write(info)

        with open(self.dataPath + "/test_split_v2.txt", "w") as test_file:
            for sample in test:
                info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
                test_file.write(info)
Esempio n. 9
0
    def trainAndTest(self, testDataset):
        """Run every round with similarity-based filtering and record the error.

        Each round:
          1. Every client receives a deep copy of the global model; clients
             that are not blocked train it.
          2. Client models are repeatedly merged into ``self.model`` with
             weights proportional to ``client.n * client.score``. After each
             merge, clients whose similarity to the merged model falls on
             the wrong side of a median-based threshold are flagged as
             ``badUpdate``. The loop ends when a pass flags nobody.
          3. Scores are updated, clients may become blocked, and the model is
             merged once more from the remaining clients.
          4. The merged model is evaluated on ``testDataset``.

        The four "blocked" bookkeeping lists are filled in but neither
        returned nor read inside this method.

        Returns:
            A tensor of length ``self.rounds`` holding each round's error.
        """
        # List of malicious users blocked
        maliciousBlocked = []
        # List with the iteration where a malicious user was blocked
        maliciousBlockedIt = []
        # List of benign users blocked
        benignBlocked = []
        # List with the iteration where a benign user was blocked
        benignBlockedIt = []

        roundsError = torch.zeros(self.rounds)

        for r in range(self.rounds):

            logPrint("Round... ", r)

            # Broadcast the global model to every client; only clients that
            # are not blocked train. The values returned by trainModel are
            # not used.
            for client in self.clients:
                broadcastModel = copy.deepcopy(self.model)
                client.updateModel(broadcastModel)
                if not client.blocked:
                    error, pred = client.trainModel()

            models = self._retrieveClientModelsDict()

            # Any non-zero start value works: it only lets the loop run once.
            badCount = 2
            slack = self.xi
            while badCount != 0:
                # Weights for this pass: n * score, normalised over the
                # clients accepted by notBlockedNorBadUpdate.
                pT_epoch = 0.0
                for client in self.clients:
                    if self.notBlockedNorBadUpdate(client):
                        client.pEpoch = client.n * client.score
                        pT_epoch = pT_epoch + client.pEpoch

                # NOTE(review): if no client is accepted, pT_epoch stays 0.0
                # but this division is then never reached.
                for client in self.clients:
                    if self.notBlockedNorBadUpdate(client):
                        client.pEpoch = client.pEpoch / pT_epoch

                # comb is 0.0 for the first merged client and 1.0 afterwards
                # (presumably the factor applied to the model being merged
                # into -- confirm against _mergeModels).
                comb = 0.0
                for client in self.clients:
                    if self.notBlockedNorBadUpdate(client):
                        self._mergeModels(models[client].to(self.device),
                                          self.model.to(self.device),
                                          client.pEpoch, comb)
                        comb = 1.0

                # Similarity of every accepted client model to the freshly
                # merged global model.
                sim = []
                for client in self.clients:
                    if self.notBlockedNorBadUpdate(client):
                        client.sim = self.__modelSimilarity(
                            self.model, models[client])
                        sim.append(np.asarray(client.sim.to("cpu")))
                        # logPrint("Similarity user ", u.id, ": ", u.sim)

                sim = np.asarray(sim)

                meanS = np.mean(sim)
                medianS = np.median(sim)
                desvS = np.std(sim)

                # The threshold sits slack standard deviations from the
                # median: below it when mean < median, above it otherwise.
                if meanS < medianS:
                    th = medianS - slack * desvS
                else:
                    th = medianS + slack * desvS

                # The slack grows on every pass.
                slack += self.deltaXi

                # NOTE(review): this loop tests only ``badUpdate``, not
                # notBlockedNorBadUpdate, so a client skipped by the
                # similarity pass above is compared using whatever
                # ``client.sim`` it already holds -- confirm this is intended.
                badCount = 0
                for client in self.clients:
                    if not client.badUpdate:
                        # Malicious self.clients are below the threshold
                        if meanS < medianS:
                            if client.sim < th:
                                # logPrint("Type1")
                                # logPrint("Bad update from user ", u.id)
                                client.badUpdate = True
                                badCount += 1
                                # Malicious self.clients are above the threshold
                        else:
                            if client.sim > th:
                                client.badUpdate = True
                                badCount += 1

            # Update scores and block clients; blocked clients get weight 0
            # and are recorded by type (client.byz) with the round number.
            pT = 0.0
            for client in self.clients:
                if not client.blocked:
                    self.updateUserScore(client)
                    client.blocked = self.checkBlockedUser(
                        client.alpha, client.beta)
                    if client.blocked:
                        logPrint("USER ", client.id, " BLOCKED!!!")
                        client.p = 0
                        if client.byz:
                            maliciousBlocked.append(client.id)
                            maliciousBlockedIt.append(r)
                        else:
                            benignBlocked.append(client.id)
                            benignBlockedIt.append(r)
                    else:
                        client.p = client.n * client.score
                        pT = pT + client.p

            # NOTE(review): this runs for every client, blocked or not. If no
            # unblocked client contributed, pT is still 0.0 and the division
            # fails -- confirm whether that case can occur.
            for client in self.clients:
                client.p = client.p / pT
                # logPrint("Weight user", u.id, ": ", round(u.p,3))

            # Update model with the updated scores
            pT_epoch = 0.0
            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    client.pEpoch = client.n * client.score
                    pT_epoch = pT_epoch + client.pEpoch

            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    client.pEpoch = client.pEpoch / pT_epoch
            # logPrint("Updated scores:{}".format([client.pEpoch for client in self.clients]))
            comb = 0.0
            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    self._mergeModels(models[client].to(self.device),
                                      self.model.to(self.device),
                                      client.pEpoch, comb)
                    comb = 1.0

            # Reset badUpdate variable
            # (only for clients that are not blocked; blocked clients keep
            # their flag)
            for client in self.clients:
                if not client.blocked:
                    client.badUpdate = False

            roundsError[r] = self.test(testDataset)

        return roundsError