def runWithoutWndchrm(self):
    """Train a MinMax-scaled random forest on the candidate features and save it.

    The candidate features are read back with ``Utils.loadFeatures`` when
    ``self.load`` is truthy; otherwise they are computed with
    ``Utils.calculateFeatures``. The fitted pipeline is handed to
    ``data_io.save_model``. Nothing is returned.
    """
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()

    print("Getting features")
    featureGetter = FeatureGetter()
    fileName = data_io.get_savez_name()
    if self.load:
        # Reuse the last features calculated from candidates.
        features = Utils.loadFeatures(fileName)
    else:
        features = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
    (namesObservations, coordinates, train) = features

    print("Getting target vector")
    (indexes, target, obs) = featureGetter.getTargetVector(
        coordinates, namesObservations, train)

    print("Training the model")
    forest = RandomForestClassifier(
        n_estimators=500,
        verbose=2,
        n_jobs=1,
        min_samples_split=10,
        random_state=1,
        compute_importances=True)
    steps = [('scaling', MinMaxScaler()), ('classifying', forest)]
    model = Pipeline(steps)
    # Only the rows selected by getTargetVector take part in the fit.
    model.fit(obs[indexes], target[indexes])

    print("Saving the classifier")
    data_io.save_model(model)
def run(self):
    """Train a random forest on wndchrm features and save it.

    When ``self.loadWndchrm`` is truthy the wndchrm features are read back
    through ``WndchrmWorkerTrain.loadWndchrmFeatures``. Otherwise the
    candidate features are obtained (loaded when ``self.load`` is truthy,
    computed if not), the selected candidates are written out through
    ``ImageSaver`` and wndchrm is executed on them. Nothing is returned.
    """
    print("Preparing the environment")
    self.prepareEnvironment()

    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    wndchrmWorker = WndchrmWorkerTrain()

    print("Getting features")
    if self.loadWndchrm:
        # Reuse the last wndchrm set of features.
        (train, target) = wndchrmWorker.loadWndchrmFeatures()
    else:
        featureGetter = FeatureGetter()
        fileName = data_io.get_savez_name()
        if self.load:
            # Reuse the last features calculated from candidates.
            candidateFeatures = Utils.loadFeatures(fileName)
        else:
            candidateFeatures = Utils.calculateFeatures(
                fileName, featureGetter, imageCollections)
        (namesObservations, coordinates, train) = candidateFeatures

        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(
            coordinates, namesObservations, train)

        print("Saving images")
        imageSaver = ImageSaver(
            coordinates[indexes],
            namesObservations[indexes],
            imageCollections,
            featureGetter.patchSize,
            target[indexes])
        imageSaver.saveImages()

        print("Executing wndchrm algorithm and extracting features")
        # From here on train/target are the wndchrm ones, not the candidates'.
        (train, target) = wndchrmWorker.executeWndchrm()

    print("Training the model")
    model = RandomForestClassifier(
        n_estimators=500,
        verbose=2,
        n_jobs=1,
        min_samples_split=30,
        random_state=1,
        compute_importances=True)
    model.fit(train, target)
    print(model.feature_importances_)

    print("Saving the classifier")
    data_io.save_model(model)
def runWithoutWndchrm(self):
    """Predict on the validation set from the candidate features.

    Loads the stored classifier, obtains the validation features (loaded
    when ``self.load`` is truthy, computed otherwise), writes both
    submission files and returns whatever ``Predictor.finalResults`` yields
    for the predictions.
    """
    print("Loading the classifier")
    classifier = data_io.load_model()
    imageCollections = data_io.get_valid_df()
    featureGetter = FeatureGetter()

    print("Getting the features")
    fileName = data_io.get_savez_name_test()
    if self.load:
        # Reuse the last features calculated from candidates.
        features = Utils.loadFeatures(fileName)
    else:
        features = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
    (namesObservations, coordinates, valid) = features

    print("Making predictions")
    rawPredictions = classifier.predict(valid)
    # The writers and the scorer below are given one prediction per row.
    predictions = rawPredictions.reshape(len(rawPredictions), 1)

    print("Writing predictions to file")
    for writer in (data_io.write_submission, data_io.write_submission_nice):
        writer(namesObservations, coordinates, predictions)

    print("Calculating final results")
    return Predictor.finalResults(namesObservations, predictions, coordinates)
def checkCandidates(self):
    """Report how many annotated points are hit by the extracted candidates.

    Every candidate is compared with the points returned by
    ``Utils.readcsv`` for its image; a candidate closer than 30 (in the
    units of ``coordinates``) to a still-unmatched point counts as a
    detection and consumes that point. Columns 0, 1 and 2 of the matched
    candidates' features are summarised as area, perimeter and roundness.
    Only prints; nothing is returned.

    NOTE(review): the CSV is re-read each time the image name changes, so
    the totals presumably rely on candidates being grouped by image.
    """
    imageCollections = data_io.get_train_df()
    featureGetter = FeatureGetter()
    (namesObservations, coordinates, train) = \
        featureGetter.getTransformedDatasetChecking(imageCollections)

    matchRadius = 30
    imageNames = namesObservations
    currentImage = imageNames[0]
    csvArray = Utils.readcsv(currentImage)
    totalMitoticPoints = len(csvArray)
    mitoticPointsDetected = 0
    matchedFeatures = []

    for row in range(len(coordinates)):
        rowImage = imageNames[row]
        if rowImage != currentImage:
            # New image: fetch its points and add them to the total.
            csvArray = Utils.readcsv(rowImage)
            totalMitoticPoints += len(csvArray)
            currentImage = rowImage

        candidate = coordinates[row]
        for point in csvArray:
            deltaX = point[0] - candidate[0]
            deltaY = point[1] - candidate[1]
            if deltaX ** 2 + deltaY ** 2 < matchRadius ** 2:
                mitoticPointsDetected += 1
                # A point can be claimed only once; leaving the loop right
                # away makes removing while iterating safe.
                csvArray.remove(point)
                matchedFeatures.append(train[row])
                break

    finalTrain = np.array(matchedFeatures)
    allArea = finalTrain[:, 0]
    allPerimeter = finalTrain[:, 1]
    allRoundness = finalTrain[:, 2]
    totalObservations = len(coordinates)
    missedPoints = totalMitoticPoints - mitoticPointsDetected

    print("Minimum Area: %f" % np.min(allArea))
    print("Minimum Perimeter: %f" % np.min(allPerimeter))
    print("Minimum Roundness: %f" % np.min(allRoundness))
    print("Maximum Area: %f" % np.max(allArea))
    print("Maximum Perimeter: %f" % np.max(allPerimeter))
    print("Maximum Roundness: %f" % np.max(allRoundness))
    print("Total number of candidates: %d" % (totalObservations))
    print("Total number of mitotic points: %d" % (totalMitoticPoints))
    print("Mitotic points detected: %d" % (mitoticPointsDetected))
    print("Mitotic points missed: %d" % (missedPoints))
def checkCandidates(self):
    """Print detection statistics for the candidates of the training images.

    A candidate counts as a detection when its squared distance to one of
    the not-yet-matched points from ``Utils.readcsv`` for the same image is
    below ``30**2``; that point is then removed so it cannot match again.
    The minimum and maximum of feature columns 0, 1 and 2 (printed as
    area, perimeter and roundness) are taken over the matched candidates
    only. Nothing is returned.

    NOTE(review): points are reloaded whenever the image name differs from
    the previous row, which presumably expects rows grouped by image.
    """
    def firstPointWithinRadius(points, candidate):
        """Return the first point closer than 30 to *candidate*, else None."""
        for point in points:
            squaredDistance = ((point[0] - candidate[0]) ** 2
                               + (point[1] - candidate[1]) ** 2)
            if squaredDistance < 30 ** 2:
                return point
        return None

    imageCollections = data_io.get_train_df()
    featureGetter = FeatureGetter()
    (namesObservations, coordinates, train) = \
        featureGetter.getTransformedDatasetChecking(imageCollections)

    imageNames = namesObservations
    currentImage = imageNames[0]
    csvArray = Utils.readcsv(imageNames[0])
    mitoticPointsDetected = 0
    totalMitoticPoints = len(csvArray)
    finalTrain = []

    for index in range(len(coordinates)):
        if imageNames[index] != currentImage:
            currentImage = imageNames[index]
            csvArray = Utils.readcsv(currentImage)
            totalMitoticPoints += len(csvArray)

        hit = firstPointWithinRadius(csvArray, coordinates[index])
        if hit is not None:
            mitoticPointsDetected += 1
            csvArray.remove(hit)
            finalTrain.append(train[index])

    finalTrain = np.array(finalTrain)
    allArea = finalTrain[:, 0]
    allPerimeter = finalTrain[:, 1]
    allRoundness = finalTrain[:, 2]
    totalObservations = len(coordinates)

    print("Minimum Area: %f" % np.min(allArea))
    print("Minimum Perimeter: %f" % np.min(allPerimeter))
    print("Minimum Roundness: %f" % np.min(allRoundness))
    print("Maximum Area: %f" % np.max(allArea))
    print("Maximum Perimeter: %f" % np.max(allPerimeter))
    print("Maximum Roundness: %f" % np.max(allRoundness))
    print("Total number of candidates: %d" % (totalObservations))
    print("Total number of mitotic points: %d" % (totalMitoticPoints))
    print("Mitotic points detected: %d" % (mitoticPointsDetected))
    print("Mitotic points missed: %d" % (totalMitoticPoints - mitoticPointsDetected))
def run(self):
    """Predict on the validation set from wndchrm features.

    Loads the stored classifier and obtains the wndchrm features either by
    running wndchrm on freshly saved candidate patches or, when
    ``self.loadWndchrm`` is truthy, by reading the stored wndchrm features.
    Both submission files are written and the value of
    ``Predictor.finalResults`` is returned.

    Raises:
        ValueError: when stored wndchrm features are used and the cached
            candidate file holds a different number of rows than the
            wndchrm features.
    """
    print("Preparing the environment")
    self.prepareEnvironment()

    print("Loading the classifier")
    classifier = data_io.load_model()
    imageCollections = data_io.get_valid_df()
    featureGetter = FeatureGetter()
    wndchrmWorker = WndchrmWorkerPredict()

    print("Getting the features")
    fileName = data_io.get_savez_name_test()
    if not self.loadWndchrm:
        if not self.load:
            (namesObservations, coordinates, _) = Utils.calculateFeatures(
                fileName, featureGetter, imageCollections)
        else:
            # Reuse the last features calculated from candidates.
            (namesObservations, coordinates, _) = Utils.loadFeatures(fileName)

        print("Saving images")
        imageSaver = ImageSaver(
            coordinates, namesObservations, imageCollections, featureGetter.patchSize)
        imageSaver.saveImages()

        print("Executing wndchrm algorithm")
        valid = wndchrmWorker.executeWndchrm(namesObservations)
    else:
        # Reuse the last wndchrm set of features.
        (valid, namesObservations) = wndchrmWorker.loadWndchrmFeatures()
        # The writers and the scorer below need the candidate coordinates,
        # which this branch used to leave unassigned (UnboundLocalError after
        # the prediction had already run). They are taken from the cached
        # candidate features instead.
        # NOTE(review): assumes the cached candidates are in the same row
        # order as the stored wndchrm features - confirm against the worker.
        (_, coordinates, _) = Utils.loadFeatures(fileName)
        if len(coordinates) != len(namesObservations):
            raise ValueError(
                "Cached candidate coordinates (%d rows) do not match the "
                "stored wndchrm features (%d rows)"
                % (len(coordinates), len(namesObservations)))

    print("Making predictions")
    predictions = classifier.predict(valid)
    predictions = predictions.reshape(len(predictions), 1)

    print("Writing predictions to file")
    data_io.write_submission(namesObservations, coordinates, predictions)
    data_io.write_submission_nice(namesObservations, coordinates, predictions)

    print("Calculating final results")
    return Predictor.finalResults(namesObservations, predictions, coordinates)
def run(self, k=3, useOnlyRF=True):
    """Select a model by k-fold validation and score it on a held-out split.

    The merged train/test features are shuffled and cut into ``k + 1``
    parts: the last one is kept for the final score, the other ``k`` take
    turns as validation fold. The model with the highest validation
    f-measure is applied to the held-out part.

    Args:
        k: number of validation folds (at least 1).
        useOnlyRF: when true the scaled random forest is the model. When
            false the forest is only used to pick the features whose
            importance exceeds 0.0001 and a scaled ``LinearSVC`` is
            trained on those.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        # Without folds no model is ever fitted and there is nothing to score.
        raise ValueError("k must be at least 1, got %r" % (k,))

    featureGetter = FeatureGetter()
    fileNameTrain = data_io.get_savez_name()
    fileNameTest = data_io.get_savez_name_test()

    print("Merging files...")
    (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
    dataset = dataset[:, self.filterIndexes(len(dataset[0]))]

    print("Shuffling and splitting the data")
    indexesChanged = np.arange(len(dataset))
    np.random.shuffle(indexesChanged)
    splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k + 1)
    splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k + 1)
    splittedData = self.getShuffledSplits(dataset, indexesChanged, k + 1)

    # Leave the last split for testing
    testNamesObs = splittedNamesObs[k]
    testCoords = splittedCoords[k]
    testDataset = splittedData[k]
    splittedNamesObs = splittedNamesObs[:k]
    splittedCoords = splittedCoords[:k]
    splittedData = splittedData[:k]

    # The full arrays are not needed any more; drop the references early.
    del dataset, coordinates, namesObservations, indexesChanged

    bestModel = None
    bestColumns = None  # feature columns bestModel was fitted on (None = all)
    bestFmeasure = 0
    for i in range(k - 1, -1, -1):  # i is the index of the validation fold
        print("Doing cross-validation for i=%d" % i)
        namesObservationsValid = splittedNamesObs[i]
        coordinatesValid = splittedCoords[i]
        datasetValid = splittedData[i]
        namesObservationsValid = np.reshape(
            namesObservationsValid, namesObservationsValid.shape[0])

        namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
        coordinatesTrain = self.getTrainData(splittedCoords, i)
        datasetTrain = self.getTrainData(splittedData, i)
        namesObservationsTrain = np.reshape(
            namesObservationsTrain, namesObservationsTrain.shape[0])

        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(
            coordinatesTrain, namesObservationsTrain, datasetTrain)

        print("Selecting features")
        classifier = RandomForestClassifier(
            n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1,
            random_state=1, compute_importances=True)
        model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
        model.fit(obs[indexes], target[indexes])

        selectedColumns = None
        if not useOnlyRF:
            importances = classifier.feature_importances_
            selectedColumns = np.where(importances > 0.0001)[0]
            print(len(selectedColumns))

            print("Training model")
            classifier = LinearSVC(verbose=1)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes][:, selectedColumns], target[indexes])

        print("Making predictions")
        if selectedColumns is None:
            predictions = model.predict(datasetValid)
        else:
            predictions = model.predict(datasetValid[:, selectedColumns])
        predictions = predictions.reshape(len(predictions), 1)

        print("Calculating validation results")
        [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(
            namesObservationsValid, predictions, coordinatesValid)

        # "bestModel is None" keeps a model even when every fold scores 0;
        # previously that left bestModel unset and the final predict failed.
        if bestModel is None or fmeasure > bestFmeasure:
            bestFmeasure = fmeasure
            bestModel = model
            # The column subset differs per fold, so it must travel with the
            # model it was used to train.
            bestColumns = selectedColumns

        del datasetTrain, datasetValid
        del coordinatesTrain, coordinatesValid
        del namesObservationsTrain, namesObservationsValid

    print("Calculating final results")
    if bestColumns is None:
        predictions = bestModel.predict(testDataset)
    else:
        # Feed the model the same columns it was fitted on.
        predictions = bestModel.predict(testDataset[:, bestColumns])
    # Same column shape as every other call to Predictor.finalResults.
    predictions = predictions.reshape(len(predictions), 1)

    print("The final score is: ")
    testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0])
    Predictor.finalResults(testNamesObs, predictions, testCoords)
def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True):
    """Cross-validate the classifier and print overall and per-patient scores.

    Args:
        k: number of folds for the shuffled split; overridden to 12 when
            ``patientSplit`` is true and to 2 when ``breakin2`` is true
            as well.
        patientSplit: use the splits from ``self.getSplits`` instead of
            shuffled splits.
        useOnlyRF: when true the scaled random forest is the model. When
            false the forest only selects the features whose importance
            exceeds 0.0001 and a scaled ``LinearSVC`` is trained on them.
        breakin2: only read when ``patientSplit`` is true; regroups the
            splits through ``self.getNewSplits``.

    Prints the summed true positives, false positives and false negatives
    with the derived precision, recall and f-measure, then the same three
    scores for each of the 12 per-patient counters. Nothing is returned.
    """
    patientCount = 12

    def precisionRecallFmeasure(tp, fp, fn):
        """Return (precision, recall, f-measure); 0 where a ratio is undefined."""
        precision = 0 if tp + fp == 0 else (tp + 0.0) / (tp + fp + 0.0)
        recall = 0 if tp + fn == 0 else (tp + 0.0) / (tp + fn + 0.0)
        if recall + precision == 0:
            fmeasure = 0
        else:
            fmeasure = 2 * (precision * recall) / (recall + precision)
        return (precision, recall, fmeasure)

    featureGetter = FeatureGetter()
    overallTP = 0
    overallFP = 0
    overallFN = 0
    fileNameTrain = data_io.get_savez_name()
    fileNameTest = data_io.get_savez_name_test()

    print("Merging files...")
    (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
    dataset = dataset[:, self.filterIndexes(len(dataset[0]))]

    print("Shuffling and splitting the data")
    indexesChanged = np.arange(len(dataset))
    np.random.shuffle(indexesChanged)
    if patientSplit:
        k = patientCount
        (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(
            namesObservations, coordinates, dataset)
        if breakin2:
            # getNewSplits consumes the output of getSplits, so this step
            # only exists inside the patientSplit branch.
            k = 2
            (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(
                splittedNamesObs, splittedCoords, splittedData)
    else:
        splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k)
        splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k)
        splittedData = self.getShuffledSplits(dataset, indexesChanged, k)

    # The full arrays are not needed any more; drop the references early.
    del dataset, coordinates, namesObservations, indexesChanged

    overallArrayTP = np.zeros(patientCount)
    overallArrayFP = np.zeros(patientCount)
    overallArrayFN = np.zeros(patientCount)
    for i in range(k - 1, -1, -1):  # i is the index of the validation fold
        print("Doing cross-validation for i=%d" % i)
        namesObservationsTest = splittedNamesObs[i]
        coordinatesTest = splittedCoords[i]
        datasetTest = splittedData[i]
        namesObservationsTest = np.reshape(
            namesObservationsTest, namesObservationsTest.shape[0])

        namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
        coordinatesTrain = self.getTrainData(splittedCoords, i)
        datasetTrain = self.getTrainData(splittedData, i)
        namesObservationsTrain = np.reshape(
            namesObservationsTrain, namesObservationsTrain.shape[0])

        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(
            coordinatesTrain, namesObservationsTrain, datasetTrain)

        print("Selecting features")
        classifier = RandomForestClassifier(
            n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1,
            random_state=1, compute_importances=True)
        model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
        model.fit(obs[indexes], target[indexes])

        if not useOnlyRF:
            importances = classifier.feature_importances_
            filterImportances = np.where(importances > 0.0001)[0]
            print(len(filterImportances))

            print("Training model")
            classifier = LinearSVC(verbose=1)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes][:, filterImportances], target[indexes])

        print("Making predictions")
        if not useOnlyRF:
            predictions = model.predict(datasetTest[:, filterImportances])
        else:
            predictions = model.predict(datasetTest)
        predictions = predictions.reshape(len(predictions), 1)

        print("Calculating final results")
        [truePositives, falsePositives, falseNegatives, _, _, _,
         (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(
            namesObservationsTest, predictions, coordinatesTest)
        print(arrayTP)
        print(arrayFP)
        print(arrayFN)
        overallArrayTP += arrayTP
        overallArrayFP += arrayFP
        overallArrayFN += arrayFN
        overallTP += truePositives
        overallFP += falsePositives
        overallFN += falseNegatives

        del datasetTrain, datasetTest
        del coordinatesTrain, coordinatesTest
        del namesObservationsTrain, namesObservationsTest

    (precision, recall, fmeasure) = precisionRecallFmeasure(overallTP, overallFP, overallFN)
    print("Overall results for k=%d" % k)
    print(overallTP)
    print(overallFP)
    print(overallFN)
    print(precision)
    print(recall)
    print(fmeasure)

    for i in range(len(overallArrayTP)):
        # This header used to be a bare string expression, so it never
        # reached the output and the per-patient numbers were unlabelled.
        print("Results for patient number %d:" % (i + 1))
        (precision, recall, fmeasure) = precisionRecallFmeasure(
            overallArrayTP[i], overallArrayFP[i], overallArrayFN[i])
        print(precision)
        print(recall)
        print(fmeasure)