def runWithoutWndchrm(self): print "Loading the classifier" classifier = data_io.load_model() imageCollections = data_io.get_valid_df() featureGetter = FeatureGetter() print "Getting the features" fileName = data_io.get_savez_name_test() if not self.load: #Last features calculated from candidates (namesObservations, coordinates, valid) = Utils.calculateFeatures(fileName, featureGetter, imageCollections) else: (namesObservations, coordinates, valid) = Utils.loadFeatures(fileName) print "Making predictions" #valid = normalize(valid, axis=0) #askdfhashdf predictions = classifier.predict(valid) predictions = predictions.reshape(len(predictions), 1) print "Writing predictions to file" data_io.write_submission(namesObservations, coordinates, predictions) data_io.write_submission_nice(namesObservations, coordinates, predictions) print "Calculating final results" return Predictor.finalResults(namesObservations, predictions, coordinates)
def prepareEnvironment(self):
    """Rotate the previous test output out of the way and recreate a
    fresh testing folder; optionally archive the cached feature files
    so they are recomputed on this run.
    """
    testFolder = data_io.get_test_folder()
    currentPath = os.path.join(data_io.get_testing_folder(), testFolder)
    archivePath = os.path.join(data_io.get_testing_old_folder(), testFolder)
    # Move the previous run's folder into the archive location.
    Utils.shift(data_io.get_testing_old_folder(), archivePath, testFolder, currentPath)
    os.mkdir(currentPath)
    if not self.load:
        # Features will be recalculated: shift the stale savez aside.
        savezName = data_io.get_savez_name_test()
        Utils.shift('.', savezName, savezName, savezName)
    if not self.loadWndchrm:
        # Wndchrm dataset will be regenerated: shift the stale one aside.
        wndchrmName = data_io.get_wndchrm_dataset_test()
        Utils.shift('.', wndchrmName, wndchrmName, wndchrmName)
def run(self):
    """Full prediction pipeline using wndchrm features: prepare folders,
    load the trained classifier, obtain (or recompute) wndchrm features,
    predict, write submissions and return the final results."""
    print "Preparing the environment"
    self.prepareEnvironment()
    print "Loading the classifier"
    classifier = data_io.load_model()
    imageCollections = data_io.get_valid_df()
    featureGetter = FeatureGetter()
    wndchrmWorker = WndchrmWorkerPredict()
    print "Getting the features"
    if not self.loadWndchrm: #Last wndchrm set of features
        fileName = data_io.get_savez_name_test()
        if not self.load: #Last features calculated from candidates
            (namesObservations, coordinates, _) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
        else:
            (namesObservations, coordinates, _) = Utils.loadFeatures(fileName)
        print "Saving images"
        # Patches are written to disk so the external wndchrm tool can read them.
        imageSaver = ImageSaver(coordinates, namesObservations, imageCollections, featureGetter.patchSize)
        imageSaver.saveImages()
        print "Executing wndchrm algorithm"
        valid = wndchrmWorker.executeWndchrm(namesObservations)
    else:
        # NOTE(review): on this path `coordinates` is never assigned, so the
        # write_submission/finalResults calls below will raise NameError.
        # Confirm whether loadWndchrmFeatures should also return coordinates,
        # or whether they must be reloaded via Utils.loadFeatures here.
        (valid, namesObservations) = wndchrmWorker.loadWndchrmFeatures()
    print "Making predictions"
    predictions = classifier.predict(valid)
    predictions = predictions.reshape(len(predictions), 1)
    print "Writing predictions to file"
    data_io.write_submission(namesObservations, coordinates, predictions)
    data_io.write_submission_nice(namesObservations, coordinates, predictions)
    print "Calculating final results"
    return Predictor.finalResults(namesObservations, predictions, coordinates)
def run(self, k=3, useOnlyRF=True): featureGetter = FeatureGetter() fileNameTrain = data_io.get_savez_name() fileNameTest = data_io.get_savez_name_test() print "Merging files..." (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest) dataset = dataset[:,self.filterIndexes(len(dataset[0]))] print "Shuffling and splitting the data" indexesChanged = np.arange(len(dataset)) np.random.shuffle(indexesChanged) splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k+1) splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k+1) splittedData = self.getShuffledSplits(dataset, indexesChanged, k+1) """Leave the last split for testing""" testNamesObs = splittedNamesObs[k] testCoords = splittedCoords[k] testDataset = splittedData[k] splittedNamesObs = splittedNamesObs[:k] splittedCoords = splittedCoords[:k] splittedData = splittedData[:k] del(dataset) del(coordinates) del(namesObservations) del(indexesChanged) bestModel = None bestFmeasure = 0 for i in range(k-1,-1,-1):#i is the index of the validation print "Doing cross-validation for i=%d" %i namesObservationsValid = splittedNamesObs[i] coordinatesValid = splittedCoords[i] datasetValid = splittedData[i] namesObservationsValid = np.reshape(namesObservationsValid, namesObservationsValid.shape[0]) namesObservationsTrain = self.getTrainData(splittedNamesObs,i) coordinatesTrain = self.getTrainData(splittedCoords,i) datasetTrain = self.getTrainData(splittedData, i) namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0]) print "Getting target vector" (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain) print "Selecting features" classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, random_state=1, compute_importances=True) model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)]) model.fit(obs[indexes], 
target[indexes]) if not useOnlyRF: importances = classifier.feature_importances_ filterImportances = np.where(importances > 0.0001)[0] print len(filterImportances) #namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0]) print "Training model" #classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=100, random_state=1, compute_importances=True) #classifier = KNeighborsClassifier() classifier = LinearSVC(verbose=1) #classifier = MLPClassifier(verbose=1) model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)]) model.fit(obs[indexes][:,filterImportances], target[indexes]) print "Making predictions" if not useOnlyRF: predictions = model.predict(datasetValid[:,filterImportances]) else: predictions = model.predict(datasetValid) predictions = predictions.reshape(len(predictions), 1) print "Calculating validation results" [_, _, _, _, _, fmeasure, _] = Predictor.finalResults(namesObservationsValid, predictions, coordinatesValid) if fmeasure > bestFmeasure: bestFmeasure = fmeasure bestModel = model del(datasetTrain) del(datasetValid) del(coordinatesTrain) del(coordinatesValid) del(namesObservationsTrain) del(namesObservationsValid) print "Calculating final results" predictions = bestModel.predict(testDataset) print "The final score is: " testNamesObs = np.reshape(testNamesObs, testNamesObs.shape[0]) Predictor.finalResults(testNamesObs, predictions, testCoords)
def run(self, k=3, patientSplit=True, useOnlyRF=True, breakin2=True): featureGetter = FeatureGetter() overallTP = 0 overallFP = 0 overallFN = 0 fileNameTrain = data_io.get_savez_name() fileNameTest = data_io.get_savez_name_test() print "Merging files..." (namesObservations, coordinates, dataset) = self.mergeFiles(fileNameTrain, fileNameTest) dataset = dataset[:,self.filterIndexes(len(dataset[0]))] print "Shuffling and splitting the data" indexesChanged = np.arange(len(dataset)) np.random.shuffle(indexesChanged) if patientSplit: k = 12 (splittedNamesObs, splittedCoords, splittedData) = self.getSplits(namesObservations, coordinates, dataset) if breakin2: k = 2 (splittedNamesObs, splittedCoords, splittedData) = self.getNewSplits(splittedNamesObs, splittedCoords, splittedData) else: splittedNamesObs = self.getShuffledSplits(namesObservations, indexesChanged, k) splittedCoords = self.getShuffledSplits(coordinates, indexesChanged, k) splittedData = self.getShuffledSplits(dataset, indexesChanged, k) del(dataset) del(coordinates) del(namesObservations) del(indexesChanged) overallArrayTP = np.zeros(12) overallArrayFP = np.zeros(12) overallArrayFN = np.zeros(12) for i in range(k-1,-1,-1):#i is the index of the validation print "Doing cross-validation for i=%d" %i namesObservationsTest = splittedNamesObs[i] coordinatesTest = splittedCoords[i] datasetTest = splittedData[i] namesObservationsTest = np.reshape(namesObservationsTest, namesObservationsTest.shape[0]) namesObservationsTrain = self.getTrainData(splittedNamesObs,i) coordinatesTrain = self.getTrainData(splittedCoords,i) datasetTrain = self.getTrainData(splittedData, i) namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0]) print "Getting target vector" (indexes, target, obs) = featureGetter.getTargetVector(coordinatesTrain, namesObservationsTrain, datasetTrain) print "Selecting features" classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=1, 
random_state=1, compute_importances=True) model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)]) model.fit(obs[indexes], target[indexes]) if not useOnlyRF: importances = classifier.feature_importances_ filterImportances = np.where(importances > 0.0001)[0] print len(filterImportances) #namesObservationsTrain = np.reshape(namesObservationsTrain, namesObservationsTrain.shape[0]) print "Training model" #classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=100, random_state=1, compute_importances=True) #classifier = KNeighborsClassifier() classifier = LinearSVC(verbose=1) #classifier = MLPClassifier(verbose=1) model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)]) model.fit(obs[indexes][:,filterImportances], target[indexes]) print "Making predictions" if not useOnlyRF: predictions = model.predict(datasetTest[:,filterImportances]) else: predictions = model.predict(datasetTest) predictions = predictions.reshape(len(predictions), 1) print "Calculating final results" [truePositives, falsePositives, falseNegatives, _, _, _, (arrayTP, arrayFP, arrayFN)] = Predictor.finalResults(namesObservationsTest, predictions, coordinatesTest) print arrayTP print arrayFP print arrayFN overallArrayTP += arrayTP overallArrayFP += arrayFP overallArrayFN += arrayFN overallTP += truePositives overallFP += falsePositives overallFN += falseNegatives del(datasetTrain) del(datasetTest) del(coordinatesTrain) del(coordinatesTest) del(namesObservationsTrain) del(namesObservationsTest) precision = 0 if overallTP+overallFP == 0 else (overallTP+0.0)/(overallTP+overallFP+0.0) recall = 0 if overallTP+overallFN == 0 else (overallTP+0.0)/(overallTP+overallFN+0.0) fmeasure = 0 if recall+precision == 0 else 2*(precision*recall)/(recall+precision) print "Overall results for k=%d" %k print overallTP print overallFP print overallFN print precision print recall print fmeasure for i in range(len(overallArrayTP)): "Results for 
patient number %d:"% (i+1) overallTP = overallArrayTP[i] overallFP = overallArrayFP[i] overallFN = overallArrayFN[i] precision = 0 if overallTP+overallFP == 0 else (overallTP+0.0)/(overallTP+overallFP+0.0) recall = 0 if overallTP+overallFN == 0 else (overallTP+0.0)/(overallTP+overallFN+0.0) fmeasure = 0 if recall+precision == 0 else 2*(precision*recall)/(recall+precision) print precision print recall print fmeasure