Beispiel #1
0
 def runWithoutWndchrm(self):
     """Train a min-max-scaled random forest on candidate features and save it.

     The features come from ``Utils.calculateFeatures`` unless ``self.load``
     is truthy, in which case they are read back with ``Utils.loadFeatures``
     from the file named by ``data_io.get_savez_name()``.  The fitted
     pipeline is passed to ``data_io.save_model``; nothing is returned.
     """
     print("Reading in the training data")
     trainImages = data_io.get_train_df()

     print("Getting features")
     featureGetter = FeatureGetter()
     featuresFile = data_io.get_savez_name()
     if self.load:
         # Last features calculated from candidates.
         featureBundle = Utils.loadFeatures(featuresFile)
     else:
         featureBundle = Utils.calculateFeatures(featuresFile, featureGetter,
                                                 trainImages)
     (names, coords, features) = featureBundle

     print("Getting target vector")
     targetBundle = featureGetter.getTargetVector(coords, names, features)
     (keep, labels, samples) = targetBundle

     print("Training the model")
     forestOptions = dict(n_estimators=500,
                          verbose=2,
                          n_jobs=1,
                          min_samples_split=10,
                          random_state=1,
                          compute_importances=True)
     forest = RandomForestClassifier(**forestOptions)
     steps = [('scaling', MinMaxScaler()), ('classifying', forest)]
     pipeline = Pipeline(steps)
     # Only the rows selected by getTargetVector are used for fitting.
     pipeline.fit(samples[keep], labels[keep])

     print("Saving the classifier")
     data_io.save_model(pipeline)
Beispiel #2
0
 def run(self):
     """Train a random forest on wndchrm features and save it.

     When ``self.loadWndchrm`` is truthy the wndchrm features are loaded
     through ``WndchrmWorkerTrain.loadWndchrmFeatures``.  Otherwise the
     candidate features are computed (or loaded when ``self.load`` is
     truthy), the selected candidates are written out with ``ImageSaver``
     and ``executeWndchrm`` is run to produce the features.  The fitted
     model is passed to ``data_io.save_model``; nothing is returned.
     """
     print("Preparing the environment")
     self.prepareEnvironment()

     print("Reading in the training data")
     trainImages = data_io.get_train_df()
     worker = WndchrmWorkerTrain()

     print("Getting features")
     if self.loadWndchrm:
         # Last wndchrm set of features.
         (features, labels) = worker.loadWndchrmFeatures()
     else:
         featureGetter = FeatureGetter()
         featuresFile = data_io.get_savez_name()
         if self.load:
             # Last features calculated from candidates.
             candidateData = Utils.loadFeatures(featuresFile)
         else:
             candidateData = Utils.calculateFeatures(featuresFile,
                                                     featureGetter,
                                                     trainImages)
         (names, coords, candidateFeatures) = candidateData

         print("Getting target vector")
         (keep, candidateLabels, _) = featureGetter.getTargetVector(
             coords, names, candidateFeatures)

         print("Saving images")
         saver = ImageSaver(coords[keep], names[keep], trainImages,
                            featureGetter.patchSize, candidateLabels[keep])
         saver.saveImages()

         print("Executing wndchrm algorithm and extracting features")
         (features, labels) = worker.executeWndchrm()

     print("Training the model")
     forest = RandomForestClassifier(n_estimators=500,
                                     verbose=2,
                                     n_jobs=1,
                                     min_samples_split=30,
                                     random_state=1,
                                     compute_importances=True)
     forest.fit(features, labels)
     print(forest.feature_importances_)

     print("Saving the classifier")
     data_io.save_model(forest)
Beispiel #3
0
 def runWithoutWndchrm(self):
     """Predict on the validation images with the saved classifier.

     The features come from ``Utils.calculateFeatures`` unless ``self.load``
     is truthy, in which case they are read with ``Utils.loadFeatures`` from
     the file named by ``data_io.get_savez_name_test()``.  Predictions are
     reshaped into a single column, written with both submission writers
     of ``data_io``, and scored.

     Returns whatever ``Predictor.finalResults`` returns for the names,
     predictions and coordinates.
     """
     print("Loading the classifier")
     savedModel = data_io.load_model()
     validImages = data_io.get_valid_df()
     featureGetter = FeatureGetter()

     print("Getting the features")
     featuresFile = data_io.get_savez_name_test()
     if self.load:
         # Last features calculated from candidates.
         featureBundle = Utils.loadFeatures(featuresFile)
     else:
         featureBundle = Utils.calculateFeatures(featuresFile, featureGetter,
                                                 validImages)
     (names, coords, validFeatures) = featureBundle

     print("Making predictions")
     rawLabels = savedModel.predict(validFeatures)
     columnLabels = rawLabels.reshape(len(rawLabels), 1)

     print("Writing predictions to file")
     data_io.write_submission(names, coords, columnLabels)
     data_io.write_submission_nice(names, coords, columnLabels)

     print("Calculating final results")
     results = Predictor.finalResults(names, columnLabels, coords)
     return results
Beispiel #4
0
 def runWithoutWndchrm(self):
     """Fit and save a scaling + random-forest pipeline on candidate features.

     ``self.load`` selects between recomputing the features with
     ``Utils.calculateFeatures`` (falsy) and loading them with
     ``Utils.loadFeatures`` (truthy); both use the file name returned by
     ``data_io.get_savez_name()``.  Nothing is returned.
     """
     print("Reading in the training data")
     imagesFrame = data_io.get_train_df()

     print("Getting features")
     getter = FeatureGetter()
     savezName = data_io.get_savez_name()
     if self.load:
         # Last features calculated from candidates.
         (obsNames, obsCoords, trainMatrix) = Utils.loadFeatures(savezName)
     else:
         (obsNames, obsCoords, trainMatrix) = Utils.calculateFeatures(
             savezName, getter, imagesFrame)

     print("Getting target vector")
     (selected, targets, observations) = getter.getTargetVector(
         obsCoords, obsNames, trainMatrix)

     print("Training the model")
     forest = RandomForestClassifier(
         n_estimators=500,
         verbose=2,
         n_jobs=1,
         min_samples_split=10,
         random_state=1,
         compute_importances=True,
     )
     scaledForest = Pipeline([
         ('scaling', MinMaxScaler()),
         ('classifying', forest),
     ])
     scaledForest.fit(observations[selected], targets[selected])

     print("Saving the classifier")
     data_io.save_model(scaledForest)
Beispiel #5
0
 def run(self):
     """Fit a random forest on wndchrm features and hand it to data_io.

     With ``self.loadWndchrm`` truthy, the features and targets are loaded
     by the wndchrm worker.  Otherwise candidate features are obtained
     (computed, or loaded when ``self.load`` is truthy), the selected
     candidates are saved as images, and the worker's ``executeWndchrm``
     supplies the features and targets.  Nothing is returned.
     """
     print("Preparing the environment")
     self.prepareEnvironment()

     print("Reading in the training data")
     imagesFrame = data_io.get_train_df()
     wndchrm = WndchrmWorkerTrain()

     print("Getting features")
     if self.loadWndchrm:
         # Last wndchrm set of features.
         (trainMatrix, targets) = wndchrm.loadWndchrmFeatures()
     else:
         getter = FeatureGetter()
         savezName = data_io.get_savez_name()
         if self.load:
             # Last features calculated from candidates.
             (obsNames, obsCoords, rawMatrix) = Utils.loadFeatures(savezName)
         else:
             (obsNames, obsCoords, rawMatrix) = Utils.calculateFeatures(
                 savezName, getter, imagesFrame)

         print("Getting target vector")
         (selected, rawTargets, _) = getter.getTargetVector(
             obsCoords, obsNames, rawMatrix)

         print("Saving images")
         ImageSaver(
             obsCoords[selected],
             obsNames[selected],
             imagesFrame,
             getter.patchSize,
             rawTargets[selected],
         ).saveImages()

         print("Executing wndchrm algorithm and extracting features")
         (trainMatrix, targets) = wndchrm.executeWndchrm()

     print("Training the model")
     forestOptions = dict(n_estimators=500,
                          verbose=2,
                          n_jobs=1,
                          min_samples_split=30,
                          random_state=1,
                          compute_importances=True)
     forest = RandomForestClassifier(**forestOptions)
     forest.fit(trainMatrix, targets)
     print(forest.feature_importances_)

     print("Saving the classifier")
     data_io.save_model(forest)
Beispiel #6
0
 def checkCandidates(self):
     """Print how many annotated mitotic points the candidates cover.

     Each candidate is compared against the annotation list of its image
     (read with ``Utils.readcsv``).  A candidate closer than 30 units to an
     annotated point counts as a detection; that point is removed from the
     list so it is counted once.  The first three feature columns of the
     matched candidates are reported as area, perimeter and roundness.
     """
     trainImages = data_io.get_train_df()
     featureGetter = FeatureGetter()
     checking = featureGetter.getTransformedDatasetChecking(trainImages)
     (imageNames, coordinates, train) = checking

     activeImage = imageNames[0]
     annotations = Utils.readcsv(imageNames[0])
     totalMitoticPoints = len(annotations)
     detected = 0
     matchedFeatures = []
     radiusSquared = 30**2

     for idx in range(len(coordinates)):
         # Annotations are re-read whenever the image name changes.
         if imageNames[idx] != activeImage:
             annotations = Utils.readcsv(imageNames[idx])
             totalMitoticPoints += len(annotations)
             activeImage = imageNames[idx]
         for point in annotations:
             dx = point[0] - coordinates[idx][0]
             dy = point[1] - coordinates[idx][1]
             if (dx**2 + dy**2) < radiusSquared:
                 detected += 1
                 # Safe despite iterating: the loop is left right after.
                 annotations.remove(point)
                 matchedFeatures.append(train[idx])
                 break

     matchedFeatures = np.array(matchedFeatures)
     areas = matchedFeatures[:, 0]
     perimeters = matchedFeatures[:, 1]
     roundness = matchedFeatures[:, 2]
     candidateCount = len(coordinates)
     missed = totalMitoticPoints - detected

     print("Minimum Area: %f" % np.min(areas))
     print("Minimum Perimeter: %f" % np.min(perimeters))
     print("Minimum Roundness: %f" % np.min(roundness))
     print("Maximum Area: %f" % np.max(areas))
     print("Maximum Perimeter: %f" % np.max(perimeters))
     print("Maximum Roundness: %f" % np.max(roundness))
     print("Total number of candidates: %d" % (candidateCount))
     print("Total number of mitotic points: %d" % (totalMitoticPoints))
     print("Mitotic points detected: %d" % (detected))
     print("Mitotic points missed: %d" % (missed))
Beispiel #7
0
 def checkCandidates(self):
     """Report candidate coverage of the annotated mitotic points.

     For every candidate, the annotation list of its image (from
     ``Utils.readcsv``) is searched for a point whose squared distance is
     below ``30**2``.  The first such point is removed from the list and the
     candidate's feature row is kept.  Min/max of the first three feature
     columns and the detection counts are printed; nothing is returned.
     """
     imagesFrame = data_io.get_train_df()
     getter = FeatureGetter()
     (imageNames, coordinates, train) = getter.getTransformedDatasetChecking(
         imagesFrame)

     previousName = imageNames[0]
     groundTruth = Utils.readcsv(imageNames[0])
     hits = 0
     totalMitoticPoints = len(groundTruth)
     keptRows = []

     for row in range(len(coordinates)):
         if imageNames[row] != previousName:
             # A different image: load its annotations.
             groundTruth = Utils.readcsv(imageNames[row])
             totalMitoticPoints += len(groundTruth)
             previousName = imageNames[row]
         for truth in groundTruth:
             squaredDistance = ((truth[0] - coordinates[row][0]) ** 2 +
                                (truth[1] - coordinates[row][1]) ** 2)
             if squaredDistance < 30 ** 2:
                 hits += 1
                 # Each annotated point is matched at most once.
                 groundTruth.remove(truth)
                 keptRows.append(train[row])
                 break

     keptRows = np.array(keptRows)
     areaColumn = keptRows[:, 0]
     perimeterColumn = keptRows[:, 1]
     roundnessColumn = keptRows[:, 2]
     totalObservations = len(coordinates)

     print("Minimum Area: %f" % np.min(areaColumn))
     print("Minimum Perimeter: %f" % np.min(perimeterColumn))
     print("Minimum Roundness: %f" % np.min(roundnessColumn))
     print("Maximum Area: %f" % np.max(areaColumn))
     print("Maximum Perimeter: %f" % np.max(perimeterColumn))
     print("Maximum Roundness: %f" % np.max(roundnessColumn))
     print("Total number of candidates: %d" % (totalObservations))
     print("Total number of mitotic points: %d" % (totalMitoticPoints))
     print("Mitotic points detected: %d" % (hits))
     print("Mitotic points missed: %d" % (totalMitoticPoints - hits))
Beispiel #8
0
 def run(self):
     """Predict on the validation images using wndchrm features.

     When ``self.loadWndchrm`` is falsy, the candidate features are
     computed (or loaded when ``self.load`` is truthy), the candidates are
     saved as images and ``executeWndchrm`` produces the feature matrix.
     When it is truthy, the wndchrm features and observation names are
     loaded by the worker.

     Returns whatever ``Predictor.finalResults`` returns for the names,
     predictions and coordinates.

     Raises:
         ValueError: on the ``self.loadWndchrm`` path, when the cached
             coordinates and the loaded observation names differ in length.
     """
     print("Preparing the environment")
     self.prepareEnvironment()
     print("Loading the classifier")
     classifier = data_io.load_model()
     imageCollections = data_io.get_valid_df()
     featureGetter = FeatureGetter()
     wndchrmWorker = WndchrmWorkerPredict()
     print("Getting the features")
     fileName = data_io.get_savez_name_test()
     if not self.loadWndchrm:  # Last wndchrm set of features
         if not self.load:  # Last features calculated from candidates
             (namesObservations, coordinates,
              _) = Utils.calculateFeatures(fileName, featureGetter,
                                           imageCollections)
         else:
             (namesObservations, coordinates,
              _) = Utils.loadFeatures(fileName)
         print("Saving images")
         imageSaver = ImageSaver(coordinates, namesObservations,
                                 imageCollections, featureGetter.patchSize)
         imageSaver.saveImages()
         print("Executing wndchrm algorithm")
         valid = wndchrmWorker.executeWndchrm(namesObservations)
     else:
         (valid, namesObservations) = wndchrmWorker.loadWndchrmFeatures()
         # Previously `coordinates` was never assigned on this path, so the
         # submission writers below failed with a NameError.  Recover the
         # coordinates from the cached candidate features.
         # NOTE(review): assumes the cached rows are in the same order as
         # the observations returned by loadWndchrmFeatures -- confirm.
         (_, coordinates, _) = Utils.loadFeatures(fileName)
         if len(coordinates) != len(namesObservations):
             raise ValueError(
                 "Cached coordinates (%d) do not match the loaded wndchrm "
                 "observations (%d)" % (len(coordinates),
                                        len(namesObservations)))
     print("Making predictions")
     predictions = classifier.predict(valid)
     # The writers and the scorer receive the predictions as one column.
     predictions = predictions.reshape(len(predictions), 1)
     print("Writing predictions to file")
     data_io.write_submission(namesObservations, coordinates, predictions)
     data_io.write_submission_nice(namesObservations, coordinates,
                                   predictions)
     print("Calculating final results")
     return Predictor.finalResults(namesObservations, predictions,
                                   coordinates)
Beispiel #9
0
    def run(self, k=3, useOnlyRF=True):
        """Cross-validate on k folds and score the best model on a held-out split.

        The merged train/test features are shuffled and cut into ``k + 1``
        splits.  The last split is held out.  Each of the first ``k`` splits
        serves once as validation data while the others are used for
        training.  The model with the highest validation F-measure is then
        evaluated on the held-out split with ``Predictor.finalResults``.

        Args:
            k: number of cross-validation folds; must be at least 1.
            useOnlyRF: when true, the random-forest pipeline is the model.
                When false, the forest is only used to pick features with
                importance above 0.0001 and a ``LinearSVC`` pipeline is
                trained on those columns.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1, got %r" % (k,))

        featureGetter = FeatureGetter()
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k + 1)
        splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k + 1)
        splittedData = self.getShuffledSplits(dataset, indexesChanged, k + 1)

        # Leave the last split for testing.
        testNamesObs = splittedNamesObs[k]
        testCoords = splittedCoords[k]
        testDataset = splittedData[k]

        splittedNamesObs = splittedNamesObs[:k]
        splittedCoords = splittedCoords[:k]
        splittedData = splittedData[:k]

        # Drop the references to the full arrays; only the splits are needed.
        del dataset
        del coordinates
        del namesObservations
        del indexesChanged

        bestModel = None
        bestFilter = None  # feature columns the best model was trained on
        bestFmeasure = 0

        for i in range(k - 1, -1, -1):  # i is the index of the validation split
            print("Doing cross-validation for i=%d" % i)
            namesObservationsValid = splittedNamesObs[i]
            coordinatesValid = splittedCoords[i]
            datasetValid = splittedData[i]
            namesObservationsValid = np.reshape(namesObservationsValid, namesObservationsValid.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")
            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)
            print("Selecting features")
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])
            filterImportances = None
            if not useOnlyRF:
                importances = classifier.feature_importances_
                filterImportances = np.where(importances > 0.0001)[0]
                print(len(filterImportances))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, filterImportances], target[indexes])
            print("Making predictions")
            if filterImportances is not None:
                predictions = model.predict(datasetValid[:, filterImportances])
            else:
                predictions = model.predict(datasetValid)
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating validation results")
            [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(namesObservationsValid, predictions, coordinatesValid)
            # `bestModel is None` keeps a model even when every fold scores
            # an F-measure of 0; otherwise the final prediction would be
            # attempted on None.
            if bestModel is None or fmeasure > bestFmeasure:
                bestFmeasure = fmeasure
                bestModel = model
                bestFilter = filterImportances
            del datasetTrain
            del datasetValid
            del coordinatesTrain
            del coordinatesValid
            del namesObservationsTrain
            del namesObservationsValid

        print("Calculating final results")
        if bestFilter is not None:
            # The best model only saw these columns during training, so the
            # held-out split must be reduced to the same columns.
            testDataset = testDataset[:, bestFilter]
        predictions = bestModel.predict(testDataset)
        # Same column shape that is passed to finalResults during validation.
        predictions = predictions.reshape(len(predictions), 1)
        print("The final score is: ")
        testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0])
        Predictor.finalResults(testNamesObs, predictions, testCoords)
Beispiel #10
0
    def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True):
        """Run k-fold cross-validation and print overall and per-patient scores.

        Args:
            k: number of folds for the shuffled split.  It is overridden to
                12 when ``patientSplit`` is true, and to 2 when ``breakin2``
                is also true.
            patientSplit: use ``self.getSplits`` instead of shuffled splits.
            useOnlyRF: when true, the random-forest pipeline is the model.
                When false, the forest is only used to pick features with
                importance above 0.0001 and a ``LinearSVC`` pipeline is
                trained on those columns.
            breakin2: with ``patientSplit``, regroup the splits through
                ``self.getNewSplits``.

        True positives, false positives and false negatives are accumulated
        over all folds; precision, recall and F-measure are printed for the
        totals and then for each of the 12 per-patient slots.
        """

        def computeScores(truePos, falsePos, falseNeg):
            """Return (precision, recall, fmeasure), using 0 for undefined ratios."""
            # Adding 0.0 forces float division under Python 2.
            precision = 0 if truePos + falsePos == 0 else (truePos + 0.0) / (truePos + falsePos + 0.0)
            recall = 0 if truePos + falseNeg == 0 else (truePos + 0.0) / (truePos + falseNeg + 0.0)
            fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)
            return (precision, recall, fmeasure)

        featureGetter = FeatureGetter()
        overallTP = 0
        overallFP = 0
        overallFN = 0
        fileNameTrain = data_io.get_savez_name()
        fileNameTest = data_io.get_savez_name_test()
        print("Merging files...")
        (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)

        dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
        print("Shuffling and splitting the data")
        indexesChanged = np.arange(len(dataset))
        np.random.shuffle(indexesChanged)
        if patientSplit:
            k = 12
            (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(namesObservations, coordinates, dataset)
            if breakin2:
                k = 2
                (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(splittedNamesObs, splittedCoords, splittedData)
        else:
            splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k)
            splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k)
            splittedData = self.getShuffledSplits(dataset, indexesChanged, k)

        # Drop the references to the full arrays; only the splits are needed.
        del dataset
        del coordinates
        del namesObservations
        del indexesChanged

        # One slot per patient; presumably the data set has 12 patients
        # (matches k = 12 above) -- confirm against Predictor.finalResults.
        overallArrayTP = np.zeros(12)
        overallArrayFP = np.zeros(12)
        overallArrayFN = np.zeros(12)

        for i in range(k - 1, -1, -1):  # i is the index of the validation split
            print("Doing cross-validation for i=%d" % i)
            namesObservationsTest = splittedNamesObs[i]
            coordinatesTest = splittedCoords[i]
            datasetTest = splittedData[i]
            namesObservationsTest = np.reshape(namesObservationsTest, namesObservationsTest.shape[0])
            namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
            coordinatesTrain = self.getTrainData(splittedCoords, i)
            datasetTrain = self.getTrainData(splittedData, i)
            namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0])
            print("Getting target vector")

            (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain)

            print("Selecting features")
            classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes], target[indexes])
            if not useOnlyRF:
                importances = classifier.feature_importances_
                filterImportances = np.where(importances > 0.0001)[0]
                print(len(filterImportances))
                print("Training model")
                classifier = LinearSVC(verbose=1)
                model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
                model.fit(obs[indexes][:, filterImportances], target[indexes])
            print("Making predictions")
            if not useOnlyRF:
                predictions = model.predict(datasetTest[:, filterImportances])
            else:
                predictions = model.predict(datasetTest)
            predictions = predictions.reshape(len(predictions), 1)
            print("Calculating final results")
            [truePositives, falsePositives, falseNegatives, _, _, _, (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(namesObservationsTest, predictions, coordinatesTest)
            print(arrayTP)
            print(arrayFP)
            print(arrayFN)

            overallArrayTP += arrayTP
            overallArrayFP += arrayFP
            overallArrayFN += arrayFN
            overallTP += truePositives
            overallFP += falsePositives
            overallFN += falseNegatives
            del datasetTrain
            del datasetTest
            del coordinatesTrain
            del coordinatesTest
            del namesObservationsTrain
            del namesObservationsTest

        (precision, recall, fmeasure) = computeScores(overallTP, overallFP, overallFN)

        print("Overall results for k=%d" % k)
        print(overallTP)
        print(overallFP)
        print(overallFN)
        print(precision)
        print(recall)
        print(fmeasure)

        for i in range(len(overallArrayTP)):
            # The header was a bare string expression before, so it was
            # never shown and the per-patient numbers were unlabeled.
            print("Results for patient number %d:" % (i + 1))
            (precision, recall, fmeasure) = computeScores(overallArrayTP[i], overallArrayFP[i], overallArrayFN[i])
            print(precision)
            print(recall)
            print(fmeasure)