Example #1
def run(self):
    print("Preparing the environment")
    self.prepareEnvironment()
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    wndchrmWorker = WndchrmWorkerTrain()
    print("Getting features")
    if not self.loadWndchrm:  # Otherwise reuse the last wndchrm feature set
        featureGetter = FeatureGetter()
        fileName = data_io.get_savez_name()
        if not self.load:  # Otherwise reuse the last features calculated from candidates
            (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
        else:
            (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
        print("Getting target vector")
        (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
        print("Saving images")
        imageSaver = ImageSaver(coordinates[indexes], namesObservations[indexes],
                                imageCollections, featureGetter.patchSize, target[indexes])
        imageSaver.saveImages()
        print("Executing wndchrm algorithm and extracting features")
        (train, target) = wndchrmWorker.executeWndchrm()
    else:
        (train, target) = wndchrmWorker.loadWndchrmFeatures()
    print("Training the model")
    # compute_importances was removed from scikit-learn; feature_importances_
    # is always available after fitting, so the argument is dropped here.
    model = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=30, random_state=1)
    model.fit(train, target)
    print(model.feature_importances_)
    print("Saving the classifier")
    data_io.save_model(model)
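A minimal self-contained sketch of ranking those importances after fitting; the toy data here stands in for the wndchrm features, which are not reproduced in this snippet:

# Minimal sketch: ranking feature_importances_ from a fitted forest.
# Toy data only; the repository's real features come from wndchrm.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

X = np.random.rand(200, 4)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

model = RandomForestClassifier(n_estimators=100, random_state=1)
model.fit(X, y)

# Sort features from most to least important.
order = np.argsort(model.feature_importances_)[::-1]
for rank, idx in enumerate(order, start=1):
    print("%d. feature %d (%.4f)" % (rank, idx, model.feature_importances_[idx]))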
Example #2
def main():

    classifier1 = RandomForestClassifier(n_estimators=100, max_features=0.5, max_depth=5.0)
    classifier2 = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=0)
    classifier3 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier4 = SVC(kernel='rbf', C=10.0, random_state=0, gamma=0.10)
    classifier5 = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    classifier6 = GaussianNB()

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, one-hot encoding and factorization procedures...")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    #print(train.head())
    train = train.values

    eclf = EnsembleClassifier(clfs=[classifier1, classifier2, classifier3, classifier5, classifier6], voting='hard')
    #eclf = EnsembleClassifier(clfs=[classifier2], voting='hard')
    scores = cross_val_score(estimator=eclf, X=train[:, 1:], y=train[:, 0], cv=10, scoring='roc_auc')

    # scoring='roc_auc' yields ROC AUC, not accuracy, so label it accordingly.
    print("ROC AUC: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))
    eclf.fit(train[:, 1:], train[:, 0])

    print("Saving the classifier")
    data_io.save_model(eclf)
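EnsembleClassifier is a custom class shipped with this repository. A minimal self-contained sketch of the same hard-voting idea with scikit-learn's built-in VotingClassifier, on placeholder data:

# Minimal sketch: hard-voting ensemble via scikit-learn's VotingClassifier.
# X and y are placeholder data standing in for the cleaned training matrix.
import numpy as np
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

X = np.random.rand(100, 5)
y = np.random.randint(0, 2, size=100)

eclf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=0)),
        ('lr', LogisticRegression(max_iter=1000)),
        ('nb', GaussianNB()),
    ],
    voting='hard',  # majority vote over predicted class labels
)
scores = cross_val_score(eclf, X, y, cv=10)
print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))

Note that a hard-voting ensemble exposes no predict_proba, so the scoring='roc_auc' used above generally requires voting='soft' instead.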
Example #3
def runWithoutWndchrm(self):
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    print("Getting features")
    featureGetter = FeatureGetter()
    fileName = data_io.get_savez_name()
    if not self.load:  # Otherwise reuse the last features calculated from candidates
        (namesObservations, coordinates,
         train) = Utils.calculateFeatures(fileName, featureGetter,
                                          imageCollections)
    else:
        (namesObservations, coordinates,
         train) = Utils.loadFeatures(fileName)
    print("Getting target vector")
    (indexes, target,
     obs) = featureGetter.getTargetVector(coordinates, namesObservations,
                                          train)
    print("Training the model")
    # compute_importances was removed from scikit-learn; feature_importances_
    # is always available after fitting, so the argument is dropped here.
    classifier = RandomForestClassifier(n_estimators=500,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    #classifier = KNeighborsClassifier(n_neighbors=50)
    model = Pipeline([('scaling', MinMaxScaler()),
                      ('classifying', classifier)])
    model.fit(obs[indexes], target[indexes])
    print("Saving the classifier")
    data_io.save_model(model)
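A minimal standalone sketch of the scale-then-classify Pipeline pattern used above, on toy data. Scaling matters little for a random forest, but it is essential for the commented-out KNeighborsClassifier alternative, which is presumably why the Pipeline is there:

# Minimal sketch of the scale-then-classify Pipeline pattern, on toy data.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X = np.random.rand(100, 3) * 100  # features on arbitrary scales
y = np.random.randint(0, 2, size=100)

model = Pipeline([
    ('scaling', MinMaxScaler()),  # scaling parameters are learned during fit
    ('classifying', RandomForestClassifier(n_estimators=50, random_state=1)),
])
model.fit(X, y)
print(model.predict(X[:5]))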
Example #4
def main():

    classifier1 = RandomForestClassifier(n_estimators=100, max_features=0.5, max_depth=5.0)
    classifier2 = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=0)
    classifier3 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier4 = SVC(kernel='rbf', C=10.0, random_state=0, gamma=0.10)
    classifier5 = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    classifier6 = GaussianNB()
    classifier7 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, one-hot encoding and factorization procedures...")
    train = FeatureConverter().clean_data(train)
    train.drop(['Id'], axis=1, inplace=True)
    #print(train.head())
    train = train.values

    #eclf = EnsembleClassifier(clfs=[classifier1, classifier2, classifier3, classifier5, classifier6], voting='hard')
    #eclf = EnsembleClassifier(clfs=[classifier1], voting='hard')
    eclf = classifier3
    #scores = cross_val_score(estimator=eclf, X=train[:, 0:-1], y=train[:, -1], cv=10, scoring='roc_auc')

    #print("ROC AUC: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))
    eclf.fit(train[:, 0:-1], train[:, -1])

#     importances = eclf.feature_importances_
#     indices = np.argsort(importances)[::-1]
#     for f in range(train[:, 0:-1].shape[1]):
#         print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
#
    print("Saving the classifier")
    data_io.save_model(eclf)
Example #5
def get_cv_score():

    classifier = data_io.load_model()
    train = data_io.get_train_df()
    scores = cv.cross_val_score(classifier, train[[x for x in train.columns if x != 'label']], train['label'])

    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #6
def main():
    print("Reading in the training data")
    data = data_io.get_train_df()
    print("Extracting features")
    feature_extractor = Vectorizer(MAX_FEATURES)
    category_vectorizer = DictVectorizer()


    #category_title = pd.get_dummies(train['Title'])
    #print (category_vectorizer.shape, X.shape)

    X = form_input(data, feature_extractor, category_vectorizer)
    #location = pd.get_dummies(train['LocationNormalized'])
    #X = hstack((X, location))
    #contract_time = pd.get_dummies(train['ContractTime'])
    #X = hstack((X, contract_time))
    #print(X)
    y = data["SalaryNormalized"]
    print("Training model")
    linreg.train(X, y)
    print("Making predictions")
    predictions = linreg.predict(X)
    mae_train = metrics.MAE(predictions, data["SalaryNormalized"])
    print('MAE train=%s' % mae_train)


    print("Validating...")

    data = data_io.get_valid_df()
    X = form_input(data, feature_extractor, category_vectorizer, train=False)
    predictions = linreg.predict(X)
    data_io.write_submission(predictions)

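metrics.MAE is a helper from this repository; assuming it computes the usual mean absolute error, the same number can be obtained from scikit-learn:

# Minimal sketch: mean absolute error via scikit-learn, assumed equivalent
# to the repository's metrics.MAE helper.
import numpy as np
from sklearn.metrics import mean_absolute_error

y_true = np.array([30000.0, 45000.0, 52000.0])  # placeholder salaries
y_pred = np.array([32000.0, 40000.0, 50000.0])  # placeholder predictions
print("MAE train=%s" % mean_absolute_error(y_true, y_pred))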
Example #7
def get_cv_score():

    classifier = data_io.load_model()
    train = data_io.get_train_df()
    scores = cv.cross_val_score(
        classifier, train[[x for x in train.columns if x != 'label']],
        train['label'])

    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
Example #8
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    for key in train:
        classifier = get_pipeline(train[key])
        classifier.fit(train[key], train[key]["SalaryNormalized"])
        print("Saving the classifier for %s" %key)
        data_io.save_model(classifier,key)
Example #9
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, train["SalaryNormalized"])

    print("Saving the classifier")
    data_io.save_model(classifier)
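get_pipeline() is defined elsewhere in the repository. A purely hypothetical sketch of what a pipeline with this fit signature could look like for the salary task; the column name 'FullDescription' and every component here are illustrative assumptions, not the repository's actual implementation:

# Hypothetical sketch of a get_pipeline() that accepts a whole DataFrame,
# pulls out one text column, vectorizes it, and regresses the salary.
# All names and components here are assumptions for illustration.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

def get_pipeline():
    return Pipeline([
        # Select an assumed text column from the incoming DataFrame.
        ('select', FunctionTransformer(lambda df: df['FullDescription'], validate=False)),
        ('tfidf', TfidfVectorizer(max_features=1000)),
        ('model', Ridge()),
    ])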
Example #10
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train[[x for x in train.columns if x != 'label']], train['label'])

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #11
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    classifier = get_pipeline(train)
    classifier.fit(train, train["SalaryNormalized"])

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #12
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train[[x for x in train.columns if x != 'label']],
                   train['label'])

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #13
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    mean = train["SalaryNormalized"].mean()
    print("The mean salary is %f" % mean)

    print("Saving the model")
    data_io.save_model(mean)

    predictions = [mean] * len(train)
    print(metrics.MAE(predictions, train["SalaryNormalized"].tolist()))
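scikit-learn packages this mean-prediction baseline as DummyRegressor; a minimal sketch on toy salaries:

# Minimal sketch: the same mean-prediction baseline via DummyRegressor.
import numpy as np
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error

salaries = np.array([25000.0, 30000.0, 42000.0, 55000.0])  # placeholder targets
X = np.zeros((len(salaries), 1))  # features are ignored by the dummy model

baseline = DummyRegressor(strategy='mean')
baseline.fit(X, salaries)
print(mean_absolute_error(salaries, baseline.predict(X)))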
Example #14
def main():
    markdown = PagedownToHtml()

    print("Reading in the training data")
    train = data_io.get_train_df()
    for i in train.index:
        # .loc avoids pandas chained assignment, which may silently fail to write.
        train.loc[i, "BodyMarkdown"] = markdown.convert(train["BodyMarkdown"][i])

    print("Extracting features and training")
    classifier = get_pipeline()
    classifier.fit(train, train["OpenStatus"])

    print("Saving the classifier")
    data_io.save_model(classifier, "model.pickle")
    model = data_io.load_model("model.pickle")  # reload, presumably as a sanity check
Example #15
def runWithoutWndchrm(self):
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    print("Getting features")
    featureGetter = FeatureGetter()
    fileName = data_io.get_savez_name()
    if not self.load:  # Otherwise reuse the last features calculated from candidates
        (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
    else:
        (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
    print("Getting target vector")
    (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
    print("Training the model")
    # compute_importances was removed from scikit-learn; feature_importances_
    # is always available after fitting, so the argument is dropped here.
    classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=10, random_state=1)
    #classifier = KNeighborsClassifier(n_neighbors=50)
    model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
    model.fit(obs[indexes], target[indexes])
    print("Saving the classifier")
    data_io.save_model(model)
Example #16
def run(self):
    print("Preparing the environment")
    self.prepareEnvironment()
    print("Reading in the training data")
    imageCollections = data_io.get_train_df()
    wndchrmWorker = WndchrmWorkerTrain()
    print("Getting features")
    if not self.loadWndchrm:  # Otherwise reuse the last wndchrm feature set
        featureGetter = FeatureGetter()
        fileName = data_io.get_savez_name()
        if not self.load:  # Otherwise reuse the last features calculated from candidates
            (namesObservations, coordinates,
             train) = Utils.calculateFeatures(fileName, featureGetter,
                                              imageCollections)
        else:
            (namesObservations, coordinates,
             train) = Utils.loadFeatures(fileName)
        print("Getting target vector")
        (indexes, target,
         obs) = featureGetter.getTargetVector(coordinates,
                                              namesObservations, train)
        print("Saving images")
        imageSaver = ImageSaver(coordinates[indexes],
                                namesObservations[indexes],
                                imageCollections, featureGetter.patchSize,
                                target[indexes])
        imageSaver.saveImages()
        print("Executing wndchrm algorithm and extracting features")
        (train, target) = wndchrmWorker.executeWndchrm()
    else:
        (train, target) = wndchrmWorker.loadWndchrmFeatures()
    print("Training the model")
    # compute_importances was removed from scikit-learn; feature_importances_
    # is always available after fitting, so the argument is dropped here.
    model = RandomForestClassifier(n_estimators=500,
                                   verbose=2,
                                   n_jobs=1,
                                   min_samples_split=30,
                                   random_state=1)
    model.fit(train, target)
    print(model.feature_importances_)
    print("Saving the classifier")
    data_io.save_model(model)
Example #17
def checkCandidates(self):
    imageCollections = data_io.get_train_df()
    featureGetter = FeatureGetter()
    (namesObservations, coordinates,
     train) = featureGetter.getTransformedDatasetChecking(imageCollections)
    imageNames = namesObservations
    currentImage = imageNames[0]
    csvArray = Utils.readcsv(imageNames[0])
    mitoticPointsDetected = 0
    totalMitoticPoints = len(csvArray)
    finalTrain = []
    for i in range(len(coordinates)):
        if imageNames[i] != currentImage:
            csvArray = Utils.readcsv(imageNames[i])
            totalMitoticPoints += len(csvArray)
            currentImage = imageNames[i]
        for point in csvArray:
            # A candidate counts as a detection if it lies within 30 pixels of
            # an annotated mitotic point; each annotation matches at most once.
            if ((point[0] - coordinates[i][0])**2 +
                    (point[1] - coordinates[i][1])**2) < 30**2:
                mitoticPointsDetected += 1
                csvArray.remove(point)
                finalTrain.append(train[i])
                break
    finalTrain = np.array(finalTrain)
    allArea = finalTrain[:, 0]
    allPerimeter = finalTrain[:, 1]
    allRoundness = finalTrain[:, 2]
    totalObservations = len(coordinates)
    print("Minimum Area: %f" % np.min(allArea))
    print("Minimum Perimeter: %f" % np.min(allPerimeter))
    print("Minimum Roundness: %f" % np.min(allRoundness))
    print("Maximum Area: %f" % np.max(allArea))
    print("Maximum Perimeter: %f" % np.max(allPerimeter))
    print("Maximum Roundness: %f" % np.max(allRoundness))
    print("Total number of candidates: %d" % totalObservations)
    print("Total number of mitotic points: %d" % totalMitoticPoints)
    print("Mitotic points detected: %d" % mitoticPointsDetected)
    print("Mitotic points missed: %d" % (totalMitoticPoints -
                                         mitoticPointsDetected))
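The inner loop above tests whether a candidate falls within 30 pixels of an annotated mitotic point. A minimal sketch of the same squared-distance test vectorized with NumPy, on toy coordinates:

# Minimal sketch: squared-distance-within-radius test, vectorized with NumPy.
import numpy as np

candidates = np.array([[10.0, 12.0], [200.0, 150.0]])      # toy candidate centers
mitotic_points = np.array([[15.0, 20.0], [300.0, 400.0]])  # toy annotations
radius = 30

# Pairwise squared distances, shape (n_candidates, n_points).
diff = candidates[:, None, :] - mitotic_points[None, :, :]
sq_dist = (diff ** 2).sum(axis=2)
detected = (sq_dist < radius ** 2).any(axis=1)
print(detected)  # [ True False]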
Example #18
def checkCandidates(self):
    imageCollections = data_io.get_train_df()
    featureGetter = FeatureGetter()
    (namesObservations, coordinates, train) = featureGetter.getTransformedDatasetChecking(imageCollections)
    imageNames = namesObservations
    currentImage = imageNames[0]
    csvArray = Utils.readcsv(imageNames[0])
    mitoticPointsDetected = 0
    totalMitoticPoints = len(csvArray)
    finalTrain = []
    for i in range(len(coordinates)):
        if imageNames[i] != currentImage:
            csvArray = Utils.readcsv(imageNames[i])
            totalMitoticPoints += len(csvArray)
            currentImage = imageNames[i]
        for point in csvArray:
            # A candidate counts as a detection if it lies within 30 pixels of
            # an annotated mitotic point; each annotation matches at most once.
            if ((point[0] - coordinates[i][0]) ** 2 + (point[1] - coordinates[i][1]) ** 2) < 30 ** 2:
                mitoticPointsDetected += 1
                csvArray.remove(point)
                finalTrain.append(train[i])
                break
    finalTrain = np.array(finalTrain)
    allArea = finalTrain[:, 0]
    allPerimeter = finalTrain[:, 1]
    allRoundness = finalTrain[:, 2]
    totalObservations = len(coordinates)
    print("Minimum Area: %f" % np.min(allArea))
    print("Minimum Perimeter: %f" % np.min(allPerimeter))
    print("Minimum Roundness: %f" % np.min(allRoundness))
    print("Maximum Area: %f" % np.max(allArea))
    print("Maximum Perimeter: %f" % np.max(allPerimeter))
    print("Maximum Roundness: %f" % np.max(allRoundness))
    print("Total number of candidates: %d" % totalObservations)
    print("Total number of mitotic points: %d" % totalMitoticPoints)
    print("Mitotic points detected: %d" % mitoticPointsDetected)
    print("Mitotic points missed: %d" % (totalMitoticPoints - mitoticPointsDetected))
Example #19
lr_params_grid = \
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2']
}

svc_params_grid = \
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.1, 1.0, 10.0, 100.0]
}

if __name__ == "__main__":

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, one-hot encoding and factorization procedures...")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    #print(train.head())
    train = train.values

    grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100),
                               rf_params_grid,
                               cv=5,
                               verbose=1)
    grid_search.fit(train[0:, 1:], train[0:, 0])
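rf_params_grid is defined in a part of the module not captured in this snippet. A purely illustrative grid of the expected shape; these values are assumptions, not the original ones:

# Hypothetical rf_params_grid; the original definition was not captured in
# this snippet, so these values are illustrative only.
rf_params_grid = {
    'max_depth': [3, 5, 10, None],
    'max_features': ['sqrt', 0.5],
    'min_samples_split': [2, 10, 30],
}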
Example #20
lr_params_grid = \
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'penalty': ['l1', 'l2']
}

svc_params_grid = \
{
    'C': [0.001, 0.1, 1.0, 10.0, 100.0],
    'kernel': ['linear', 'rbf'],
    'gamma': [0.001, 0.1, 1.0, 10.0, 100.0]
}

if __name__ == "__main__":

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, one-hot encoding and factorization procedures...")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    #print(train.head())
    train = train.values

    grid_search = GridSearchCV(RandomForestClassifier(n_estimators=100), rf_params_grid, cv=5, verbose=1)
    grid_search.fit(train[:, 1:], train[:, 0])

    print(grid_search.best_params_)

    grid_search = GridSearchCV(LogisticRegression(random_state=0), lr_params_grid, cv=5, verbose=1)
    grid_search.fit(train[:, 1:], train[:, 0])