def run(self):
    """Full training pipeline: compute candidate features, extract wndchrm
    features, then train and persist a random-forest classifier.

    Flags read from self:
      self.loadWndchrm -- reuse previously computed wndchrm features
      self.load        -- reuse previously computed candidate features

    BUG FIX: the original used Python 2 print statements; converted to
    print() calls consistent with the rest of the file.
    """
    print("Preparing the environment")
    self.prepareEnvironment()
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    wndchrmWorker = WndchrmWorkerTrain()
    print("Getting features")
    if not self.loadWndchrm:  # recompute the wndchrm feature set
        featureGetter = FeatureGetter()
        fileName = data_io.get_savez_name()
        if not self.load:  # recompute candidate features as well
            (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
        else:
            (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
        print("Saving images")
        imageSaver = ImageSaver(coordinates[indexes], namesObservations[indexes],
                                imageCollections, featureGetter.patchSize, target[indexes])
        imageSaver.saveImages()
        print("Executing wndchrm algorithm and extracting features")
        (train, target) = wndchrmWorker.executeWndchrm()
    else:
        (train, target) = wndchrmWorker.loadWndchrmFeatures()
    print("Training the model")
    # NOTE(review): compute_importances was removed in newer scikit-learn
    # releases (importances are always exposed via feature_importances_);
    # confirm the pinned sklearn version before upgrading.
    model = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1,
                                   min_samples_split=30, random_state=1,
                                   compute_importances=True)
    model.fit(train, target)
    print(model.feature_importances_)
    print("Saving the classifier")
    data_io.save_model(model)
def main():
    """Train a hard-voting ensemble on the cleaned training set and save it."""
    # Candidate base learners. The SVC is constructed but deliberately left
    # out of the ensemble below, matching the original configuration.
    rf = RandomForestClassifier(n_estimators=100, max_features=0.5, max_depth=5.0)
    tree = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=0)
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    svc = SVC(kernel='rbf', C=10.0, random_state=0, gamma=0.10)
    logreg = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    nb = GaussianNB()

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    train = train.values

    ensemble = EnsembleClassifier(clfs=[rf, tree, knn, logreg, nb], voting='hard')

    # Cross-validated ROC-AUC estimate before the final fit.
    features, labels = train[0:, 1:], train[0:, 0]
    scores = cross_val_score(estimator=ensemble, X=features, y=labels, cv=10, scoring='roc_auc')
    print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))

    ensemble.fit(features, labels)
    print("Saving the classifier")
    data_io.save_model(ensemble)
def runWithoutWndchrm(self):
    """Train and persist a scaled random-forest pipeline directly on the
    candidate features, skipping the wndchrm extraction step.

    Flag read from self:
      self.load -- reuse previously computed candidate features

    BUG FIX: the original used Python 2 print statements; converted to
    print() calls consistent with the rest of the file.
    """
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    print("Getting features")
    featureGetter = FeatureGetter()
    fileName = data_io.get_savez_name()
    if not self.load:  # recompute candidate features
        (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
    else:
        (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
    print("Getting target vector")
    (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
    print("Training the model")
    # NOTE(review): compute_importances was removed in newer scikit-learn
    # releases -- confirm the pinned version before upgrading.
    classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1,
                                        min_samples_split=10, random_state=1,
                                        compute_importances=True)
    model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
    model.fit(obs[indexes], target[indexes])
    print("Saving the classifier")
    data_io.save_model(model)
def main():
    """Train a k-NN classifier on the cleaned data and save it.

    Several other learners are constructed but unused, mirroring the
    experimentation history of this script.
    """
    rf = RandomForestClassifier(n_estimators=100, max_features=0.5, max_depth=5.0)
    tree = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=0)
    knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    svc = SVC(kernel='rbf', C=10.0, random_state=0, gamma=0.10)
    logreg = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    nb = GaussianNB()
    gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                     max_depth=1, random_state=0)

    print("Reading in the training data")
    data = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    data = FeatureConverter().clean_data(data)
    data.drop(['Id'], axis=1, inplace=True)
    data = data.values

    # Only the k-NN model is actually trained; the label is the last column.
    model = knn
    model.fit(data[0:, 0:-1], data[0:, -1])

    print("Saving the classifier")
    data_io.save_model(model)
def get_cv_score():
    """Cross-validate the persisted classifier on the training data and
    print its mean accuracy with a two-sigma spread."""
    model = data_io.load_model()
    train = data_io.get_train_df()
    feature_cols = [c for c in train.columns if c != 'label']
    scores = cv.cross_val_score(model, train[feature_cols], train['label'])
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def main():
    """Train a linear-regression salary model, report its training MAE,
    and write predictions for the validation set."""
    print("Reading in the training data")
    data = data_io.get_train_df()

    print("Extracting features")
    feature_extractor = Vectorizer(MAX_FEATURES)
    category_vectorizer = DictVectorizer()
    X = form_input(data, feature_extractor, category_vectorizer)
    y = data["SalaryNormalized"]

    print("Training model")
    linreg.train(X, y)

    print("Making predictions")
    predictions = linreg.predict(X)
    mae_train = metrics.MAE(predictions, data["SalaryNormalized"])
    # BUG FIX: the original called print('MAE train=%s', mae_train), which
    # passes the format string and the value as two separate arguments and
    # prints them side by side instead of interpolating.
    print('MAE train=%s' % mae_train)

    print("Validating...")
    data = data_io.get_valid_df()
    X = form_input(data, feature_extractor, category_vectorizer, train=False)
    predictions = linreg.predict(X)
    data_io.write_submission(predictions)
    # NOTE(review): a stray unterminated ''' followed this function in the
    # original; it looked like the opener of a commented-out region with no
    # visible content and was removed -- confirm nothing depended on it.
def get_cv_score():
    """Load the saved classifier, cross-validate it on the training data,
    and print mean accuracy +/- two standard deviations."""
    clf = data_io.load_model()
    df = data_io.get_train_df()
    predictors = [col for col in df.columns if col != 'label']
    scores = cv.cross_val_score(clf, df[predictors], df['label'])
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
def main():
    """Train and persist one classifier per group in the training data."""
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    for key in train:
        # Each group gets its own pipeline, fitted on its own rows.
        model = get_pipeline(train[key])
        model.fit(train[key], train[key]["SalaryNormalized"])
        print("Saving the classifier for %s" % key)
        data_io.save_model(model, key)
def main():
    """Fit the feature/model pipeline on the salary data and persist it."""
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    model = get_pipeline()
    model.fit(train, train["SalaryNormalized"])

    print("Saving the classifier")
    data_io.save_model(model)
def main():
    """Train the pipeline on every non-label column and persist it."""
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    model = get_pipeline()
    predictors = [col for col in train.columns if col != 'label']
    model.fit(train[predictors], train['label'])

    print("Saving the classifier")
    data_io.save_model(model)
def main():
    """Build a pipeline from the training data, fit it on the salary
    target, and persist it."""
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    model = get_pipeline(train)
    model.fit(train, train["SalaryNormalized"])

    print("Saving the classifier")
    data_io.save_model(model)
def main():
    """Baseline model: predict the mean salary for every row, save that
    mean as the model, and print the resulting training MAE."""
    print("Reading in the training data")
    train = data_io.get_train_df()

    mean_salary = train["SalaryNormalized"].mean()
    print("The mean salary is %f" % mean_salary)

    print("Saving the model")
    data_io.save_model(mean_salary)

    # A constant prediction for every training row.
    constant_predictions = [mean_salary] * len(train)
    print(metrics.MAE(constant_predictions, train["SalaryNormalized"].tolist()))
def main():
    """Convert post bodies from markdown to HTML, train the OpenStatus
    classifier, and persist it (then reload as a sanity check)."""
    markdown = PagedownToHtml()

    print("Reading in the training data")
    train = data_io.get_train_df()
    for i in train.index:
        # BUG FIX: the original used chained indexing
        # (train["BodyMarkdown"][i] = ...), which assigns through a possibly
        # temporary object (SettingWithCopy) and is rejected by modern
        # pandas; .at performs a direct label-based scalar read/write.
        train.at[i, "BodyMarkdown"] = markdown.convert(train.at[i, "BodyMarkdown"])

    print("Extracting features and training")
    classifier = get_pipeline()
    classifier.fit(train, train["OpenStatus"])

    print("Saving the classifier")
    data_io.save_model(classifier, "model.pickle")
    # Reload immediately to verify the pickle round-trips.
    model = data_io.load_model("model.pickle")
def checkCandidates(self):
    """Report how many ground-truth mitotic points are covered by detected
    candidates, plus min/max statistics of the matched candidate features.

    A candidate within 30 px of a ground-truth point counts as a detection;
    each ground-truth point may be matched at most once.

    BUG FIX: the original used Python 2 print statements; converted to
    print() calls consistent with the rest of the file.
    """
    imageCollections = data_io.get_train_df()
    featureGetter = FeatureGetter()
    (namesObservations, coordinates, train) = featureGetter.getTransformedDatasetChecking(imageCollections)
    imageNames = namesObservations
    currentImage = imageNames[0]
    csvArray = Utils.readcsv(imageNames[0])
    mitoticPointsDetected = 0
    totalMitoticPoints = len(csvArray)
    finalTrain = []
    for i in range(len(coordinates)):
        if imageNames[i] != currentImage:
            # Moved on to a new image: load its ground-truth points.
            csvArray = Utils.readcsv(imageNames[i])
            totalMitoticPoints += len(csvArray)
            currentImage = imageNames[i]
        for point in csvArray:
            if ((point[0] - coordinates[i][0])**2 + (point[1] - coordinates[i][1])**2) < 30**2:
                mitoticPointsDetected += 1
                # Consume the matched point (safe: we break right after).
                csvArray.remove(point)
                finalTrain.append(train[i])
                break
    finalTrain = np.array(finalTrain)
    allArea = finalTrain[:, 0]
    allPerimeter = finalTrain[:, 1]
    allRoundness = finalTrain[:, 2]
    totalObservations = len(coordinates)
    print("Minimum Area: %f" % np.min(allArea))
    print("Minimum Perimeter: %f" % np.min(allPerimeter))
    print("Minimum Roundness: %f" % np.min(allRoundness))
    print("Maximum Area: %f" % np.max(allArea))
    print("Maximum Perimeter: %f" % np.max(allPerimeter))
    print("Maximum Roundness: %f" % np.max(allRoundness))
    print("Total number of candidates: %d" % (totalObservations))
    print("Total number of mitotic points: %d" % (totalMitoticPoints))
    print("Mitotic points detected: %d" % (mitoticPointsDetected))
    print("Mitotic points missed: %d" % (totalMitoticPoints - mitoticPointsDetected))
def checkCandidates(self):
    """Variant of the candidate-coverage check: count ground-truth mitotic
    points hit by candidates (30 px radius, one match per point) and print
    feature statistics for the matched candidates.

    BUG FIX: the original used Python 2 print statements; converted to
    print() calls consistent with the rest of the file.
    """
    imageCollections = data_io.get_train_df()
    featureGetter = FeatureGetter()
    (namesObservations, coordinates, train) = featureGetter.getTransformedDatasetChecking(imageCollections)
    imageNames = namesObservations
    currentImage = imageNames[0]
    csvArray = Utils.readcsv(imageNames[0])
    mitoticPointsDetected = 0
    totalMitoticPoints = len(csvArray)
    finalTrain = []
    for i in range(len(coordinates)):
        if imageNames[i] != currentImage:
            # New image: load its ground-truth points and extend the total.
            csvArray = Utils.readcsv(imageNames[i])
            totalMitoticPoints += len(csvArray)
            currentImage = imageNames[i]
        for point in csvArray:
            if ((point[0]-coordinates[i][0]) ** 2 + (point[1]-coordinates[i][1]) ** 2) < 30**2:
                mitoticPointsDetected += 1
                # Consume the matched point (safe: we break right after).
                csvArray.remove(point)
                finalTrain.append(train[i])
                break
    finalTrain = np.array(finalTrain)
    allArea = finalTrain[:, 0]
    allPerimeter = finalTrain[:, 1]
    allRoundness = finalTrain[:, 2]
    totalObservations = len(coordinates)
    print("Minimum Area: %f" % np.min(allArea))
    print("Minimum Perimeter: %f" % np.min(allPerimeter))
    print("Minimum Roundness: %f" % np.min(allRoundness))
    print("Maximum Area: %f" % np.max(allArea))
    print("Maximum Perimeter: %f" % np.max(allPerimeter))
    print("Maximum Roundness: %f" % np.max(allRoundness))
    print("Total number of candidates: %d" % (totalObservations))
    print("Total number of mitotic points: %d" % (totalMitoticPoints))
    print("Mitotic points detected: %d" % (mitoticPointsDetected))
    print("Mitotic points missed: %d" % (totalMitoticPoints - mitoticPointsDetected))
{
    # NOTE(review): this dict appears to be the tail of an
    # "lr_params_grid = \" assignment whose opening line falls outside
    # this chunk -- TODO confirm against the full file.
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2']
}

# Hyper-parameter grid for an SVC search (defined but not used below).
svc_params_grid = \
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.1, 1.0, 10.0, 100.0]
}

# Script entry point: clean the training data and grid-search a random
# forest over rf_params_grid (defined elsewhere in this file).
if __name__ == "__main__":
    print("Reading in the training data")
    train = data_io.get_train_df()
    print(
        "Cleaning data. Check here for imputation, One hot encoding and factorization procedures.."
    )
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    #print train.head()
    train = train.values
    # cv=5: five-fold cross-validation for each parameter combination.
    grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100),
                               rf_params_grid,
                               cv=5,
                               verbose=1)
    # Column 0 is the label; the remaining columns are features.
    grid_search.fit(train[0:, 1:], train[0:, 0])
{
    # NOTE(review): this dict looks like the tail of an
    # "lr_params_grid = \" assignment cut off at the chunk boundary;
    # lr_params_grid is used in the second grid search below -- confirm.
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2']
}

# Hyper-parameter grid for an SVC search (defined but not used below).
svc_params_grid = \
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.1, 1.0, 10.0, 100.0]
}

# Script entry point: clean the training data, grid-search a random
# forest, report its best parameters, then grid-search a logistic
# regression over lr_params_grid.
if __name__ == "__main__":
    print("Reading in the training data")
    train = data_io.get_train_df()
    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    #print train.head()
    train = train.values

    grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100),
                               rf_params_grid, cv=5, verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])
    # BUG FIX: the original used a Python 2 print statement here.
    print(grid_search.best_params_)

    grid_search = GridSearchCV(LogisticRegression(random_state=0),
                               lr_params_grid, cv=5, verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])