def getDatasets(self, percUsers, labels, size=None): logPrint("Loading COVIDx...") self._setRandomSeeds() data = self.__loadCOVIDxData(*size) trainDataframe, testDataframe = self._filterDataByLabel(labels, *data) clientDatasets = self._splitTrainDataIntoClientDatasets( percUsers, trainDataframe, self.COVIDxDataset) testDataset = self.COVIDxDataset(testDataframe, isTestDataset=True) return clientDatasets, testDataset
def test(self, testDataset):
    dataLoader = DataLoader(testDataset, shuffle=False)
    with torch.no_grad():
        predLabels, testLabels = zip(*[(self.predict(self.model, x), y)
                                       for x, y in dataLoader])
    predLabels = torch.tensor(predLabels, dtype=torch.long)
    testLabels = torch.tensor(testLabels, dtype=torch.long)
    # Confusion matrix; the error rate is one minus the overall accuracy
    mconf = confusion_matrix(testLabels, predLabels)
    errors = 1 - 1.0 * mconf.diagonal().sum() / len(testDataset)
    logPrint("Error Rate: ", round(100.0 * errors, 3), "%")
    return errors
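# Context, not part of the original file: predict is used above but not shown. A
# plausible standalone sketch -- returning the argmax class index for a single
# sample -- offered as an assumption about its behaviour:

import torch

def predict(model, x):
    model.eval()
    with torch.no_grad():
        output = model(x)
        return torch.argmax(output, dim=1).item()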
def __loadCOVIDxData(self, trainSize, testSize):
    if self.__datasetNotFound():
        logPrint("Can't find the train|test split .txt files, or the "
                 "/train, /test folders are not populated accordingly.")
        if not self.assembleDatasets:
            sys.exit(0)
        logPrint("Proceeding to assemble dataset from downloaded resources.")
        self.__joinDatasets()
    trainDataframe = self.__readDataframe(self.trainCSV, trainSize)
    testDataframe = self.__readDataframe(self.testCSV, testSize)
    return trainDataframe, testDataframe
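# Context, not part of the original file: __readDataframe is not shown. Given the
# space-separated "patientid filename label" format that __joinDatasets writes
# below, a minimal sketch could look like this (the size handling is an
# assumption; the real helper may sample rather than truncate):

import pandas as pd

def readDataframe(path, size=None):
    dataframe = pd.read_csv(path, sep=' ', header=None,
                            names=['id', 'fileNames', 'labels'])
    return dataframe.head(size) if size else dataframe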
def trainAndTest(self, testDataset): roundsError = torch.zeros(self.rounds) for r in range(self.rounds): logPrint("Round... ", r) self._shareModelAndTrainOnClients() models = self._retrieveClientModelsDict() # Merge models self.model = self.__medianModels(models) roundsError[r] = self.test(testDataset) return roundsError
def trainAndTest(self, testDataset): roundsError = torch.zeros(self.rounds) for r in range(self.rounds): logPrint("Round... ", r) self._shareModelAndTrainOnClients() models = self._retrieveClientModelsDict() # Merge models comb = 0.0 for client in self.clients: self._mergeModels(models[client].to(self.device), self.model.to(self.device), client.p, comb) comb = 1.0 roundsError[r] = self.test(testDataset) return roundsError
def trainAndTest(self, testDataset):
    userNo = len(self.clients)
    # Number of Byzantine workers to be tolerated
    f = int((userNo - 3) / 2)
    th = userNo - f - 2
    mk = userNo - f
    roundsError = torch.zeros(self.rounds)
    for r in range(self.rounds):
        logPrint("Round... ", r)
        self._shareModelAndTrainOnClients()
        # Compute pairwise model distances and score each user by the
        # cumulative distance to its th closest neighbours
        scores = torch.zeros(userNo)
        models = self._retrieveClientModelsDict()
        for client in self.clients:
            distances = torch.zeros((userNo, userNo))
            for client2 in self.clients:
                if client.id != client2.id:
                    distance = self.__computeModelDistance(
                        models[client].to(self.device),
                        models[client2].to(self.device))
                    distances[client.id - 1][client2.id - 1] = distance
            dd = distances[client.id - 1][:].sort()[0]
            dd = dd.cumsum(0)
            scores[client.id - 1] = dd[th]
        # Select the mk - 1 users with the lowest scores and average their models
        _, idx = scores.sort()
        selected_users = idx[:mk - 1] + 1
        # logPrint("Selected users: ", selected_users)
        comb = 0.0
        for client in self.clients:
            if client.id in selected_users:
                self._mergeModels(models[client].to(self.device),
                                  self.model.to(self.device),
                                  1 / mk, comb)
                comb = 1.0
        roundsError[r] = self.test(testDataset)
    return roundsError
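# Context, not part of the original file: __computeModelDistance is not shown. A
# plausible sketch is the Euclidean distance between the two models' flattened
# parameter vectors (an assumption about the metric actually used):

import torch

def computeModelDistance(modelA, modelB):
    vecA = torch.cat([p.data.view(-1) for p in modelA.parameters()])
    vecB = torch.cat([p.data.view(-1) for p in modelB.parameters()])
    return torch.norm(vecA - vecB)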
def getDatasets(self, percUsers, labels, size=None): logPrint("Loading Heart Disease data...") self._setRandomSeeds() trainDataframe, testDataframe, columns = self.__loadHeartDiseaseData() trainDataframe, testDataframe = self._filterDataByLabel( labels, trainDataframe, testDataframe) clientDatasets = self._splitTrainDataIntoClientDatasets( percUsers, trainDataframe, self.HeartDiseaseDataset) testDataset = self.HeartDiseaseDataset(testDataframe) if self.requireDatasetAnonymization: clientAnonymizationResults = self._anonymizeClientDatasets( clientDatasets, columns, 4, self.quasiIds, self.__setHierarchies) clientDatasets, syntacticMappings, generalizedColumns = clientAnonymizationResults testDataset = self._anonymizeTestDataset(testDataset, syntacticMappings, columns, generalizedColumns) return clientDatasets, testDataset
def __joinDatasets(self):
    dataSources = ['/covid-chestxray-dataset',
                   '/rsna-kaggle-dataset',
                   '/Figure1-covid-chestxray-dataset']
    if not len(os.listdir(self.dataPath + dataSources[0])):
        logPrint("You need to clone https://github.com/ieee8023/covid-chestxray-dataset "
                 "to {}.".format(self.dataPath + dataSources[0]))
        sys.exit(0)
    if not len(os.listdir(self.dataPath + dataSources[1])):
        logPrint("You need to unzip the RSNA Pneumonia Detection Challenge dataset "
                 "(https://www.kaggle.com/c/rsna-pneumonia-detection-challenge) "
                 "to {}.".format(self.dataPath + dataSources[1]))
        sys.exit(0)

    COPY_FILE = True
    if COPY_FILE:
        if not os.path.exists(self.dataPath + '/train'):
            os.makedirs(self.dataPath + '/train')
        if not os.path.exists(self.dataPath + '/test'):
            os.makedirs(self.dataPath + '/test')

    # Path to the COVID-19 dataset from https://github.com/ieee8023/covid-chestxray-dataset
    imgPath = self.dataPath + dataSources[0] + '/images'
    csvPath = self.dataPath + dataSources[0] + '/metadata.csv'

    # Path to https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
    kaggle_dataPath = self.dataPath + '/rsna-kaggle-dataset'
    kaggle_csvname = 'stage_2_detailed_class_info.csv'  # get all the normals from here
    kaggle_csvname2 = 'stage_2_train_labels.csv'  # get all the 1s from here, since 1 indicates pneumonia
    kaggle_imgPath = 'stage_2_train_images'

    # Parameters for the COVIDx dataset
    train = []
    test = []
    test_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
    train_count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}

    mapping = {'COVID-19': 'COVID-19',
               'SARS': 'pneumonia',
               'MERS': 'pneumonia',
               'Streptococcus': 'pneumonia',
               'Normal': 'normal',
               'Lung Opacity': 'pneumonia',
               '1': 'pneumonia'}

    # train/test split
    split = 0.1

    # Adapted from https://github.com/mlmed/torchxrayvision/blob/master/torchxrayvision/datasets.py#L814
    csv = pd.read_csv(csvPath, nrows=None)
    idx_pa = csv["view"] == "PA"  # keep only the PA view
    csv = csv[idx_pa]

    pneumonias = ["COVID-19", "SARS", "MERS", "ARDS", "Streptococcus"]
    pathologies = ["Pneumonia", "Viral Pneumonia", "Bacterial Pneumonia",
                   "No Finding"] + pneumonias
    pathologies = sorted(pathologies)

    # Get non-COVID-19 viral, bacterial and COVID-19 infections from
    # covid-chestxray-dataset, stored as patient id, image filename and label
    filename_label = {'normal': [], 'pneumonia': [], 'COVID-19': []}
    count = {'normal': 0, 'pneumonia': 0, 'COVID-19': 0}
    print(csv.keys())
    for index, row in csv.iterrows():
        f = row['finding']
        if f in mapping:
            count[mapping[f]] += 1
            entry = [int(row['patientid']), row['filename'], mapping[f]]
            filename_label[mapping[f]].append(entry)
    print('Data distribution from covid-chestxray-dataset:')
    print(count)

    # Add covid-chestxray-dataset into the COVIDx dataset. Since
    # covid-chestxray-dataset has no test split, split into train/test by
    # patient id. For COVIDx:
    #   patient 8 is used as the non-COVID-19 viral test case,
    #   patient 31 is used as the bacterial test case,
    #   patients 19, 20, 36, 42, 86 are used as COVID-19 viral test cases.
    for key in filename_label.keys():
        arr = np.array(filename_label[key])
        if arr.size == 0:
            continue
        # Split by patients:
        # num_diff_patients = len(np.unique(arr[:, 0]))
        # num_test = max(1, round(split * num_diff_patients))
        # select num_test number of random patients
        if key == 'pneumonia':
            test_patients = ['8', '31']
        elif key == 'COVID-19':
            test_patients = ['19', '20', '36', '42', '86']  # random.sample(list(arr[:, 0]), num_test)
        else:
            test_patients = []
        print('Key: ', key)
        print('Test patients: ', test_patients)
        # Go through all the patients
        for patient in arr:
            if patient[0] in test_patients:
                if COPY_FILE:
                    copyfile(os.path.join(imgPath, patient[1]),
                             os.path.join(self.dataPath, 'test', patient[1]))
                    test.append(patient)
                    test_count[patient[2]] += 1
                else:
                    print("WARNING: skipping file copy.")
                    break
            else:
                if COPY_FILE:
                    copyfile(os.path.join(imgPath, patient[1]),
                             os.path.join(self.dataPath, 'train', patient[1]))
                    train.append(patient)
                    train_count[patient[2]] += 1
                else:
                    print("WARNING: skipping file copy.")
                    break
    print('test count: ', test_count)
    print('train count: ', train_count)

    # Add normal and the remaining pneumonia cases from
    # https://www.kaggle.com/c/rsna-pneumonia-detection-challenge
    print(kaggle_dataPath)
    csv_normal = pd.read_csv(os.path.join(kaggle_dataPath, kaggle_csvname), nrows=None)
    csv_pneu = pd.read_csv(os.path.join(kaggle_dataPath, kaggle_csvname2), nrows=None)
    patients = {'normal': [], 'pneumonia': []}
    for index, row in csv_normal.iterrows():
        if row['class'] == 'Normal':
            patients['normal'].append(row['patientId'])
    for index, row in csv_pneu.iterrows():
        if int(row['Target']) == 1:
            patients['pneumonia'].append(row['patientId'])

    for key in patients.keys():
        arr = np.array(patients[key])
        if arr.size == 0:
            continue
        # Split by patients:
        # num_diff_patients = len(np.unique(arr))
        # num_test = max(1, round(split * num_diff_patients))
        # '/content/COVID-Net/'
        test_patients = np.load(self.dataPath +
                                '/COVID-Net/rsna_test_patients_{}.npy'.format(key))
        # random.sample(list(arr), num_test)
        # np.save('rsna_test_patients_{}.npy'.format(key), np.array(test_patients))
        for patient in arr:
            ds = dicom.dcmread(os.path.join(kaggle_dataPath, kaggle_imgPath,
                                            patient + '.dcm'))
            pixel_array_numpy = ds.pixel_array
            imgname = patient + '.png'
            if patient in test_patients:
                if COPY_FILE:
                    cv2.imwrite(os.path.join(self.dataPath, 'test', imgname),
                                pixel_array_numpy)
                    test.append([patient, imgname, key])
                    test_count[key] += 1
                else:
                    print("WARNING: skipping file copy.")
                    break
            else:
                if COPY_FILE:
                    cv2.imwrite(os.path.join(self.dataPath, 'train', imgname),
                                pixel_array_numpy)
                    train.append([patient, imgname, key])
                    train_count[key] += 1
                else:
                    print("WARNING: skipping file copy.")
                    break
    print('test count: ', test_count)
    print('train count: ', train_count)

    # Final stats
    print('Final stats')
    print('Train count: ', train_count)
    print('Test count: ', test_count)
    print('Total length of train: ', len(train))
    print('Total length of test: ', len(test))

    # Export to train and test txt files,
    # formatted as "patientid filename label" (space-separated)
    with open(self.dataPath + "/train_split_v2.txt", "w") as train_file:
        for sample in train:
            info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
            train_file.write(info)

    with open(self.dataPath + "/test_split_v2.txt", "w") as test_file:
        for sample in test:
            info = str(sample[0]) + ' ' + sample[1] + ' ' + sample[2] + '\n'
            test_file.write(info)
def trainAndTest(self, testDataset):
    # Lists of blocked malicious/benign users, and the rounds where each was blocked
    maliciousBlocked = []
    maliciousBlockedIt = []
    benignBlocked = []
    benignBlockedIt = []
    roundsError = torch.zeros(self.rounds)
    for r in range(self.rounds):
        logPrint("Round... ", r)
        for client in self.clients:
            broadcastModel = copy.deepcopy(self.model)
            client.updateModel(broadcastModel)
            if not client.blocked:
                error, pred = client.trainModel()
        models = self._retrieveClientModelsDict()

        # Iteratively flag bad updates until no new ones are found
        badCount = 2
        slack = self.xi
        while badCount != 0:
            pT_epoch = 0.0
            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    client.pEpoch = client.n * client.score
                    pT_epoch = pT_epoch + client.pEpoch
            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    client.pEpoch = client.pEpoch / pT_epoch
            comb = 0.0
            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    self._mergeModels(models[client].to(self.device),
                                      self.model.to(self.device),
                                      client.pEpoch, comb)
                    comb = 1.0
            sim = []
            for client in self.clients:
                if self.notBlockedNorBadUpdate(client):
                    client.sim = self.__modelSimilarity(self.model, models[client])
                    sim.append(np.asarray(client.sim.to("cpu")))
                    # logPrint("Similarity user ", client.id, ": ", client.sim)
            sim = np.asarray(sim)
            meanS = np.mean(sim)
            medianS = np.median(sim)
            desvS = np.std(sim)
            if meanS < medianS:
                th = medianS - slack * desvS
            else:
                th = medianS + slack * desvS
            slack += self.deltaXi
            badCount = 0
            for client in self.clients:
                if not client.badUpdate:
                    # Malicious clients are below the threshold
                    if meanS < medianS:
                        if client.sim < th:
                            # logPrint("Bad update from user ", client.id)
                            client.badUpdate = True
                            badCount += 1
                    # Malicious clients are above the threshold
                    else:
                        if client.sim > th:
                            client.badUpdate = True
                            badCount += 1

        # Update user scores, block users where appropriate and renormalize weights
        pT = 0.0
        for client in self.clients:
            if not client.blocked:
                self.updateUserScore(client)
                client.blocked = self.checkBlockedUser(client.alpha, client.beta)
                if client.blocked:
                    logPrint("USER ", client.id, " BLOCKED!!!")
                    client.p = 0
                    if client.byz:
                        maliciousBlocked.append(client.id)
                        maliciousBlockedIt.append(r)
                    else:
                        benignBlocked.append(client.id)
                        benignBlockedIt.append(r)
                else:
                    client.p = client.n * client.score
                    pT = pT + client.p
        for client in self.clients:
            client.p = client.p / pT
            # logPrint("Weight user", client.id, ": ", round(client.p, 3))

        # Update model with the updated scores
        pT_epoch = 0.0
        for client in self.clients:
            if self.notBlockedNorBadUpdate(client):
                client.pEpoch = client.n * client.score
                pT_epoch = pT_epoch + client.pEpoch
        for client in self.clients:
            if self.notBlockedNorBadUpdate(client):
                client.pEpoch = client.pEpoch / pT_epoch
        # logPrint("Updated scores:{}".format([client.pEpoch for client in self.clients]))
        comb = 0.0
        for client in self.clients:
            if self.notBlockedNorBadUpdate(client):
                self._mergeModels(models[client].to(self.device),
                                  self.model.to(self.device),
                                  client.pEpoch, comb)
                comb = 1.0

        # Reset the badUpdate flag
        for client in self.clients:
            if not client.blocked:
                client.badUpdate = False
        roundsError[r] = self.test(testDataset)
    return roundsError
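# Context, not part of the original file: __modelSimilarity and checkBlockedUser
# are not shown. A plausible sketch, assuming cosine similarity between
# flattened parameter vectors and a blocking rule over the alpha (good updates)
# and beta (bad updates) counters; both are assumptions about helpers the
# repository defines elsewhere:

import torch

def modelSimilarity(globalModel, clientModel):
    g = torch.cat([p.data.view(-1) for p in globalModel.parameters()])
    c = torch.cat([p.data.view(-1) for p in clientModel.parameters()])
    return torch.nn.functional.cosine_similarity(g, c, dim=0)

def checkBlockedUser(alpha, beta, threshold=0.95):
    # Block when the expected fraction of bad updates, beta / (alpha + beta),
    # exceeds the threshold (illustrative rule only)
    return beta / (alpha + beta) > threshold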