def prepareEnvironment(self):
    """Archive previous training artifacts and create fresh training folders.

    Moves the existing positive/negative training directories aside with
    Utils.shift, recreates empty ones, and — when the cached feature file or
    wndchrm dataset are about to be regenerated — archives those as well.
    """
    oldRoot = data_io.get_training_old_folder()
    newRoot = data_io.get_training_folder()
    positive = data_io.get_positive_folder()
    negative = data_io.get_negative_folder()

    pathPositive = os.path.join(newRoot, positive)
    pathNegative = os.path.join(newRoot, negative)

    # Move existing class folders out of the way before recreating them.
    Utils.shift(oldRoot, os.path.join(oldRoot, positive), positive, pathPositive)
    Utils.shift(oldRoot, os.path.join(oldRoot, negative), negative, pathNegative)
    os.mkdir(pathPositive)
    os.mkdir(pathNegative)

    # Archive cached artifacts only when this run will recompute them.
    if not self.load:
        savezName = data_io.get_savez_name()
        Utils.shift('.', savezName, savezName, savezName)
    if not self.loadWndchrm:
        wndchrmName = data_io.get_wndchrm_dataset()
        Utils.shift('.', wndchrmName, wndchrmName, wndchrmName)
def run(self):
    """Full training pipeline using wndchrm features.

    Prepares the environment, computes (or loads) candidate features, saves
    image patches, runs wndchrm feature extraction (or loads its cached
    output), fits a random forest, and persists the model via data_io.
    """
    print("Preparing the environment")
    self.prepareEnvironment()
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    wndchrmWorker = WndchrmWorkerTrain()
    print("Getting features")
    if not self.loadWndchrm:  # recompute the wndchrm feature set
        featureGetter = FeatureGetter()
        featuresFile = data_io.get_savez_name()
        if not self.load:  # recompute candidate features too
            (namesObservations, coordinates, train) = Utils.calculateFeatures(
                featuresFile, featureGetter, imageCollections)
        else:
            (namesObservations, coordinates, train) = Utils.loadFeatures(featuresFile)
        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(
            coordinates, namesObservations, train)
        print("Saving images")
        imageSaver = ImageSaver(coordinates[indexes], namesObservations[indexes],
                                imageCollections, featureGetter.patchSize,
                                target[indexes])
        imageSaver.saveImages()
        print("Executing wndchrm algorithm and extracting features")
        (train, target) = wndchrmWorker.executeWndchrm()
    else:
        (train, target) = wndchrmWorker.loadWndchrmFeatures()
    print("Training the model")
    model = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1,
                                   min_samples_split=30, random_state=1,
                                   compute_importances=True)
    model.fit(train, target)
    print(model.feature_importances_)
    print("Saving the classifier")
    data_io.save_model(model)
def runWithoutWndchrm(self):
    """Train directly on the candidate features, skipping wndchrm extraction.

    Computes (or loads) the feature matrix, builds the target vector, fits a
    MinMax-scaled random forest pipeline, and saves the model via data_io.
    """
    print("Reading in the training data")
    trainImages = data_io.get_train_df()
    print("Getting features")
    featureGetter = FeatureGetter()
    featuresFile = data_io.get_savez_name()
    if not self.load:  # recompute features from the candidates
        (namesObservations, coordinates, train) = Utils.calculateFeatures(
            featuresFile, featureGetter, trainImages)
    else:
        (namesObservations, coordinates, train) = Utils.loadFeatures(featuresFile)
    print("Getting target vector")
    (indexes, target, obs) = featureGetter.getTargetVector(
        coordinates, namesObservations, train)
    print("Training the model")
    forest = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1,
                                    min_samples_split=10, random_state=1,
                                    compute_importances=True)
    # A KNeighborsClassifier(n_neighbors=50) was tried here as an alternative.
    model = Pipeline([('scaling', MinMaxScaler()), ('classifying', forest)])
    model.fit(obs[indexes], target[indexes])
    print("Saving the classifier")
    data_io.save_model(model)
def prepareEnvironment(self):
    """Archive previous training artifacts and create fresh training folders.

    For each class folder (positive, negative) the current training directory
    is shifted into the "old" area via Utils.shift and then recreated empty.
    Cached artifacts (savez feature file, wndchrm dataset) are archived only
    when this run is going to regenerate them.
    """
    freshPaths = []
    for classFolder in (data_io.get_positive_folder(), data_io.get_negative_folder()):
        freshPath = os.path.join(data_io.get_training_folder(), classFolder)
        archivedPath = os.path.join(data_io.get_training_old_folder(), classFolder)
        Utils.shift(data_io.get_training_old_folder(), archivedPath,
                    classFolder, freshPath)
        freshPaths.append(freshPath)
    for freshPath in freshPaths:
        os.mkdir(freshPath)

    if not self.load:
        savezName = data_io.get_savez_name()
        Utils.shift('.', savezName, savezName, savezName)
    if not self.loadWndchrm:
        wndchrmName = data_io.get_wndchrm_dataset()
        Utils.shift('.', wndchrmName, wndchrmName, wndchrmName)
def run(self, k=3, useOnlyRF=True):
    """Cross-validate on k folds and score the best model on a held-out split.

    The merged train+test feature file is shuffled and split into k+1 parts;
    the last part is reserved as a final test set. For each of the k folds a
    random forest is fitted; when useOnlyRF is False its feature importances
    select a subset on which a LinearSVC is retrained. The model with the best
    validation f-measure is evaluated on the held-out split.

    Bug fix: the winning model's feature subset (filterImportances) is now
    remembered as bestFilter and applied to the held-out test matrix. The
    original code always passed the full testDataset to bestModel.predict,
    which mismatches the column count of an SVC trained on the filtered
    features (useOnlyRF=False path).
    """
    featureGetter = FeatureGetter()
    fileNameTrain = data_io.get_savez_name()
    fileNameTest = data_io.get_savez_name_test()
    print("Merging files...")
    (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
    dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
    print("Shuffling and splitting the data")
    indexesChanged = np.arange(len(dataset))
    np.random.shuffle(indexesChanged)
    splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k + 1)
    splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k + 1)
    splittedData = self.getShuffledSplits(dataset, indexesChanged, k + 1)
    # Leave the last split for testing.
    testNamesObs = splittedNamesObs[k]
    testCoords = splittedCoords[k]
    testDataset = splittedData[k]
    splittedNamesObs = splittedNamesObs[:k]
    splittedCoords = splittedCoords[:k]
    splittedData = splittedData[:k]
    # Release the merged arrays early; the splits hold what we need.
    del(dataset)
    del(coordinates)
    del(namesObservations)
    del(indexesChanged)
    bestModel = None
    bestFmeasure = 0
    bestFilter = None  # feature subset used by bestModel (None = all features)
    for i in range(k - 1, -1, -1):  # i is the index of the validation fold
        print("Doing cross-validation for i=%d" % i)
        namesObservationsValid = splittedNamesObs[i]
        coordinatesValid = splittedCoords[i]
        datasetValid = splittedData[i]
        namesObservationsValid = np.reshape(namesObservationsValid,
                                            namesObservationsValid.shape[0])
        namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
        coordinatesTrain = self.getTrainData(splittedCoords, i)
        datasetTrain = self.getTrainData(splittedData, i)
        namesObservationsTrain = np.reshape(namesObservationsTrain,
                                            namesObservationsTrain.shape[0])
        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(
            coordinatesTrain, namesObservationsTrain, datasetTrain)
        print("Selecting features")
        classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1,
                                            min_samples_split=1, random_state=1,
                                            compute_importances=True)
        model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
        model.fit(obs[indexes], target[indexes])
        filterImportances = None
        if not useOnlyRF:
            # Keep only features the forest found informative, then retrain
            # a linear SVC on that subset.
            importances = classifier.feature_importances_
            filterImportances = np.where(importances > 0.0001)[0]
            print(len(filterImportances))
            print("Training model")
            classifier = LinearSVC(verbose=1)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes][:, filterImportances], target[indexes])
        print("Making predictions")
        if not useOnlyRF:
            predictions = model.predict(datasetValid[:, filterImportances])
        else:
            predictions = model.predict(datasetValid)
        predictions = predictions.reshape(len(predictions), 1)
        print("Calculating validation results")
        [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(
            namesObservationsValid, predictions, coordinatesValid)
        if fmeasure > bestFmeasure:
            bestFmeasure = fmeasure
            bestModel = model
            bestFilter = filterImportances  # remember the matching feature subset
        del(datasetTrain)
        del(datasetValid)
        del(coordinatesTrain)
        del(coordinatesValid)
        del(namesObservationsTrain)
        del(namesObservationsValid)
    print("Calculating final results")
    # Apply the same feature subset the best model was trained on.
    if bestFilter is not None:
        predictions = bestModel.predict(testDataset[:, bestFilter])
    else:
        predictions = bestModel.predict(testDataset)
    print("The final score is: ")
    testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0])
    Predictor.finalResults(testNamesObs, predictions, testCoords)
def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True):
    """Cross-validate and report aggregate plus per-patient detection results.

    Splitting modes:
      * patientSplit=True: k is forced to 12 and self.getSplits groups the
        data per patient; breakin2=True further merges that into k=2 splits
        via self.getNewSplits.
      * patientSplit=False: k shuffled splits of the merged dataset.

    For each fold a random forest is fitted (when useOnlyRF is False, its
    importances pick a feature subset for a retrained LinearSVC), predictions
    are scored by Predictor.finalResults, and TP/FP/FN are accumulated both
    overall and per patient (12 slots). Precision/recall/f-measure are printed
    at the end for the whole run and for each patient.

    Bug fix: the per-patient header was a bare string expression
    ("Results for patient number %d:" % (i+1)) with no print, so it was
    evaluated and discarded; it is now actually printed.
    """
    featureGetter = FeatureGetter()
    overallTP = 0
    overallFP = 0
    overallFN = 0
    fileNameTrain = data_io.get_savez_name()
    fileNameTest = data_io.get_savez_name_test()
    print("Merging files...")
    (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest)
    dataset = dataset[:, self.filterIndexes(len(dataset[0]))]
    print("Shuffling and splitting the data")
    indexesChanged = np.arange(len(dataset))
    np.random.shuffle(indexesChanged)
    if patientSplit:
        k = 12  # one split per patient
        (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(
            namesObservations, coordinates, dataset)
        if breakin2:
            k = 2  # regroup the 12 patient splits into two halves
            (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(
                splittedNamesObs, splittedCoords, splittedData)
    else:
        splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k)
        splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k)
        splittedData = self.getShuffledSplits(dataset, indexesChanged, k)
    # Release the merged arrays early; the splits hold what we need.
    del(dataset)
    del(coordinates)
    del(namesObservations)
    del(indexesChanged)
    overallArrayTP = np.zeros(12)
    overallArrayFP = np.zeros(12)
    overallArrayFN = np.zeros(12)
    for i in range(k - 1, -1, -1):  # i is the index of the validation fold
        print("Doing cross-validation for i=%d" % i)
        namesObservationsTest = splittedNamesObs[i]
        coordinatesTest = splittedCoords[i]
        datasetTest = splittedData[i]
        namesObservationsTest = np.reshape(namesObservationsTest,
                                           namesObservationsTest.shape[0])
        namesObservationsTrain = self.getTrainData(splittedNamesObs, i)
        coordinatesTrain = self.getTrainData(splittedCoords, i)
        datasetTrain = self.getTrainData(splittedData, i)
        namesObservationsTrain = np.reshape(namesObservationsTrain,
                                            namesObservationsTrain.shape[0])
        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(
            coordinatesTrain, namesObservationsTrain, datasetTrain)
        print("Selecting features")
        classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1,
                                            min_samples_split=1, random_state=1,
                                            compute_importances=True)
        model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
        model.fit(obs[indexes], target[indexes])
        if not useOnlyRF:
            # Keep only features the forest found informative, then retrain
            # a linear SVC on that subset.
            importances = classifier.feature_importances_
            filterImportances = np.where(importances > 0.0001)[0]
            print(len(filterImportances))
            print("Training model")
            classifier = LinearSVC(verbose=1)
            model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
            model.fit(obs[indexes][:, filterImportances], target[indexes])
        print("Making predictions")
        if not useOnlyRF:
            predictions = model.predict(datasetTest[:, filterImportances])
        else:
            predictions = model.predict(datasetTest)
        predictions = predictions.reshape(len(predictions), 1)
        print("Calculating final results")
        [truePositives, falsePositives, falseNegatives, _, _, _,
         (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(
            namesObservationsTest, predictions, coordinatesTest)
        print(arrayTP)
        print(arrayFP)
        print(arrayFN)
        overallArrayTP += arrayTP
        overallArrayFP += arrayFP
        overallArrayFN += arrayFN
        overallTP += truePositives
        overallFP += falsePositives
        overallFN += falseNegatives
        del(datasetTrain)
        del(datasetTest)
        del(coordinatesTrain)
        del(coordinatesTest)
        del(namesObservationsTrain)
        del(namesObservationsTest)
    precision = 0 if overallTP + overallFP == 0 else (overallTP + 0.0) / (overallTP + overallFP + 0.0)
    recall = 0 if overallTP + overallFN == 0 else (overallTP + 0.0) / (overallTP + overallFN + 0.0)
    fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)
    print("Overall results for k=%d" % k)
    print(overallTP)
    print(overallFP)
    print(overallFN)
    print(precision)
    print(recall)
    print(fmeasure)
    for i in range(len(overallArrayTP)):
        # BUG FIX: this header was a bare string expression and never printed.
        print("Results for patient number %d:" % (i + 1))
        overallTP = overallArrayTP[i]
        overallFP = overallArrayFP[i]
        overallFN = overallArrayFN[i]
        precision = 0 if overallTP + overallFP == 0 else (overallTP + 0.0) / (overallTP + overallFP + 0.0)
        recall = 0 if overallTP + overallFN == 0 else (overallTP + 0.0) / (overallTP + overallFN + 0.0)
        fmeasure = 0 if recall + precision == 0 else 2 * (precision * recall) / (recall + precision)
        print(precision)
        print(recall)
        print(fmeasure)