def main():
    print("Reading training data")
    train = data_io.read_train()

    train.fillna(-1, inplace=True)

    #train_sample = train.fillna(value=-2)
    #train_sample = train[:2500000].fillna(value=0)
    train_sample = train[:100000]
    #train_sample = train.fillna(value=0)

    feature_names = list(train_sample.columns)
    feature_names.remove("click_bool")
    feature_names.remove("booking_bool")
    feature_names.remove("gross_bookings_usd")
    feature_names.remove("date_time")
    feature_names.remove("position")

    features = train_sample[feature_names].values
    #train_sample["position"] *= -1.0
    #target = train_sample["position"].values
    #target = train_sample["booking_bool"].values
    target = train_sample["booking_bool"].values

    print("Training the Classifier")
    classifier = LambdaMART(n_estimators=50,
                                        verbose=2,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
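Every example here persists its model through a project-specific data_io module whose implementation is not shown. For orientation only, a pickle-based save_model/load_model pair could look like the sketch below; the path and signatures are assumptions, not the actual project code.

# Hypothetical pickle-backed save_model/load_model pair; the path and
# signatures are illustrative assumptions, not the real data_io module.
import pickle

MODEL_PATH = "model.pkl"  # assumed default location

def save_model(model, path=MODEL_PATH):
    # pickle files must be opened in binary mode
    with open(path, "wb") as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_model(path=MODEL_PATH):
    with open(path, "rb") as f:
        return pickle.load(f)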
Example #2
def main():

    X,y = load_train_set()
    X = delete_unused_columns(X)
    #X,y = sample(X,y, 0.1)
    #X,y = selectAllCategorical(X,y)
    #print X.shape
    #exit()
#
#    import re
#    prog = re.compile(".*_[1,3]")
#    matches = [prog.match(i) is not None for i in X.index]
#    X,y = X[matches],y[matches]
    
    
    params = {'n_estimators': 3000, 'subsample': 0.6, 'random_state': 0, 'verbose':90, 'min_samples_split': 5, 'learning_rate': 0.00636406103119062, 'max_depth': 12, 'min_samples_leaf': 59}
    #params = {'n_estimators': 3000, 'subsample': 0.6, 'random_state': 0, 'verbose':90, 'min_samples_split': 5, 'learning_rate': 0.1, 'max_depth': 12, 'min_samples_leaf': 59}
    
    print params    
    score, c = cross_val(X, y, clf,params = params, n_folds = 2, shuffle = True, score_func = fit_clf, test_size = 0.10 )
  
    #bestClf = data_io.load_model();print "AUC", auc(y, bestClf.predict(X));exit(0)
    bestClf = clf(**params)
    bestClf.fit(X, y.Target)
    print "AUC", auc(y, bestClf.predict(X))

    #print_importances(X, bestClf, 1)

    print("Saving the classifier")
    data_io.save_model(bestClf)
Example #3
def main():
    
    classifier1 = RandomForestClassifier(n_estimators = 100, max_features=0.5, max_depth=5.0)
    classifier2 = DecisionTreeClassifier(max_depth = 10, criterion = 'entropy', random_state = 0)
    classifier3 = KNeighborsClassifier(n_neighbors = 5, p = 2, metric = 'minkowski')
    classifier4 = SVC(kernel = 'rbf', C = 10.0, random_state = 0, gamma = 0.10)
    classifier5 = LogisticRegression(penalty = 'l2', C = 1.0, random_state = 0)
    classifier6 = GaussianNB()
    classifier7 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
          
    print("Reading in the training data")
    train = data_io.get_train_df()
    
    print ("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['Id'], axis = 1, inplace = True)
    #print train.head()
    train = train.values
    
    #eclf = EnsembleClassifier(clfs = [classifier1, classifier2, classifier3, classifier5, classifier6], voting = 'hard')
    #eclf = EnsembleClassifier(clfs = [classifier1], voting = 'hard')
    eclf = classifier3
    #scores = cross_val_score(estimator = eclf, X = train[0:,0:-1], y = train[0:,-1], cv = 10, scoring = 'roc_auc')
    
    #print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))
    eclf.fit(train[0:,0:-1],train[0:,-1])

#     importances = eclf.feature_importances_
#     indices = np.argsort(importances)[::-1]
#     for f in range(train[0:,0:-1].shape[1]):
#         print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
#         
    print("Saving the classifier")
    data_io.save_model(eclf)
Example #4
 def run(self):
     print "Preparing the environment"
     self.prepareEnvironment()
     print "Reading in the training data"
     imageCollections = data_io.get_train_df()
     wndchrmWorker = WndchrmWorkerTrain()
     print "Getting features"
     if not self.loadWndchrm: #Last wndchrm set of features
         featureGetter = FeatureGetter()
         fileName = data_io.get_savez_name()
         if not self.load: #Last features calculated from candidates
             (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
         else:
             (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
         print "Getting target vector"
         (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
         print "Saving images"
         imageSaver = ImageSaver(coordinates[indexes], namesObservations[indexes],
                                 imageCollections, featureGetter.patchSize, target[indexes])
         imageSaver.saveImages()
         print "Executing wndchrm algorithm and extracting features"
         (train, target) = wndchrmWorker.executeWndchrm()
     else:
         (train, target) = wndchrmWorker.loadWndchrmFeatures()
     print "Training the model"
     model = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=30, random_state=1, compute_importances=True)
     model.fit(train, target)
     print model.feature_importances_
     print "Saving the classifier"
     data_io.save_model(model)
Example #5
def main():
    print("Getting features for deleted papers from the database")
    if(os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if(os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1,
                                        max_features=None)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier, prefix="forest_")
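The pattern above (compute the expensive database features once, pickle them, and reload them on later runs) is easy to reuse. A minimal Python 3 sketch of that cache-or-compute step, assuming any picklable compute() callable:

# Minimal cache-or-compute sketch (Python 3); compute() stands in for an
# expensive call such as data_io.get_features_db("TrainDeleted").
import os
import pickle

def cached(cache_path, compute):
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:  # binary mode for pickle data
            return pickle.load(f)
    result = compute()
    with open(cache_path, "wb") as f:
        pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
    return result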
Example #6
def main(argv):
    n=None
    try:
      opts, args = getopt.getopt(argv,"ht:s:",["train=", "settings="])
    except getopt.GetoptError:
      print 'test.py -t <train number> -s <settings file>'
      sys.exit(2)
    for opt, arg in opts:
      if opt == '-h':
        print 'test.py -t <train number>'
        sys.exit()
      elif opt in ("-t", "--train"):
        n = int(arg)
      elif opt in ("-s", "--settings"):
        settings = arg
    print("Reading in the training data")
    train = data_io.read_train_pairs(settings)
    target = data_io.read_train_target(settings)

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier, settings)
Example #7
def main():
    
    classifier1 = RandomForestClassifier(n_estimators = 100, max_features=0.5, max_depth=5.0)
    classifier2 = DecisionTreeClassifier(max_depth = 10, criterion = 'entropy', random_state = 0)
    classifier3 = KNeighborsClassifier(n_neighbors = 5, p = 2, metric = 'minkowski')
    classifier4 = SVC(kernel = 'rbf', C = 10.0, random_state = 0, gamma = 0.10)
    classifier5 = LogisticRegression(penalty = 'l2', C = 1.0, random_state = 0)
    classifier6 = GaussianNB()
        
    print("Reading in the training data")
    train = data_io.get_train_df()
    
    print ("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis = 1, inplace = True)
    #print train.head()
    train = train.values
    
    eclf = EnsembleClassifier(clfs = [classifier1, classifier2, classifier3, classifier5, classifier6], voting = 'hard')
    #eclf = EnsembleClassifier(clfs = [classifier2], voting = 'hard')    
    scores = cross_val_score(estimator = eclf, X = train[0:,1:], y = train[0:,0], cv = 10, scoring = 'roc_auc')
    
    print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))
    eclf.fit(train[0:,1:],train[0:,0])

    print("Saving the classifier")
    data_io.save_model(eclf)
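EnsembleClassifier above is a custom voting wrapper not shown here; scikit-learn's built-in VotingClassifier covers the same idea. A self-contained sketch on synthetic data follows; accuracy is used for scoring because hard voting exposes no predict_proba for ROC AUC.

# Sketch of the same ensemble idea with scikit-learn's VotingClassifier
# on synthetic data (the custom EnsembleClassifier is not shown here).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
eclf = VotingClassifier(
    estimators=[("rf", RandomForestClassifier(n_estimators=100, random_state=0)),
                ("lr", LogisticRegression(max_iter=1000)),
                ("nb", GaussianNB())],
    voting="hard")
scores = cross_val_score(eclf, X, y, cv=10, scoring="accuracy")
print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))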
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store',
            dest='target', help='for validation or test dataset')
    parser.add_argument('-c1', type=int, action='store',
            dest='ucluster_num', help='cluster number of users')
    parser.add_argument('-c2', type=int, action='store',
            dest='icluster_num', help='cluster number of items')

    if len(sys.argv) != 7:
        print 'Command e.g.: python cluster.py -t 0(1) -c1 20 -c2 50'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        user_features = [entry for entry in csv.reader(open(settings["USER_CLUSTER_TRAIN_FILE"]))]
        item_features = [entry for entry in csv.reader(open(settings["ITEM_CLUSTER_TRAIN_FILE"]))]
    elif para.target == 1:
        user_features = [entry for entry in csv.reader(open(settings["USER_CLUSTER_TRAIN_FILE_FOR_SUBMIT"]))]
        item_features = [entry for entry in csv.reader(open(settings["ITEM_CLUSTER_TRAIN_FILE_FOR_SUBMIT"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)
    user_features = [map(int, entry[1:]) for entry in user_features]
    item_features = [map(int, entry[1:]) for entry in item_features]

    cluster = KMeans(n_clusters=para.ucluster_num)
    cluster.fit(user_features)
    data_io.save_model(cluster, settings["USER_CLUSTER_MODEL_FILE"])

    cluster = KMeans(n_clusters=para.icluster_num)
    cluster.fit(item_features)
    data_io.save_model(cluster, settings["ITEM_CLUSTER_MODEL_FILE"])
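The two KMeans models above are fitted on user and item feature vectors and saved separately. A toy sketch of how such a saved cluster model is used afterwards, with made-up feature vectors:

# Toy sketch: fit KMeans and assign cluster ids to new feature vectors,
# mirroring how the saved user/item cluster models are applied downstream.
from sklearn.cluster import KMeans

user_features = [[0, 1, 3], [0, 2, 4], [9, 8, 7], [8, 9, 9]]  # made-up data
cluster = KMeans(n_clusters=2, n_init=10, random_state=0)
cluster.fit(user_features)
print(cluster.predict([[1, 1, 2], [9, 9, 8]]))  # one cluster id per vector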
Example #9
def main():
    print("Reading training data ...")
    train = data_io.read_train()
    train.fillna(0, inplace=True)

    train_sample = train.fillna(value=0)

    features = ut.preprocess(train_sample)
    target = ut.construct_target(train_sample)
    # target = train_sample["booking_bool"].values
    # save the processed data, which may be useful 
    # to test the performance of our model
    print("Saving processed training data ...")
    data_io.save_processed_data([features, target])

    print("Training the Regressor ...")
    regressor = RandomForestRegressor(n_estimators=10, #RandomForestClassifier
                                        verbose=2,
                                        n_jobs=-1,
                                        max_features = "sqrt",
                                        min_samples_split=10,
                                        random_state=1)
    regressor.fit(features, target)
    
    print("Saving the Regressor ...")
    data_io.save_model(regressor)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store',
            dest='target', help='for validation or test dataset')

    if len(sys.argv) != 3:
        print 'Command e.g.: python train.py -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        features_targets = [entry for entry in csv.reader(open(settings["LR_TRAIN_FILE"]))]
    elif para.target == 1:
        features_targets = [entry for entry in csv.reader(open(settings["LR_TRAIN_FILE_FOR_SUBMIT"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)
    features = [map(float, entry[2:-1]) for entry in features_targets]
    targets = [map(int, entry[-1]) for entry in features_targets]

    '''classifier = GradientBoostingClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)'''
    classifier = LogisticRegression(penalty='l2',
                                    dual=False,
                                    tol=0.0001,
                                    C=1.0,
                                    fit_intercept=True,
                                    intercept_scaling=1,
                                    class_weight=None,
                                    random_state=None)
    classifier.fit(features, targets)
    data_io.save_model(classifier, settings["LR_MODEL_FILE"])
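The list comprehensions above rely on Python 2, where map() returns a list. A minimal Python 3 sketch of the same CSV-to-features conversion; the file name is illustrative and the last column is taken as a single integer label:

# Python 3 sketch of the CSV parsing above: map() returns an iterator,
# so conversions are wrapped in list(); "lr_train.csv" is an assumed name.
import csv

with open("lr_train.csv") as f:
    rows = [entry for entry in csv.reader(f)]
features = [list(map(float, entry[2:-1])) for entry in rows]
targets = [int(entry[-1]) for entry in rows]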
def main():
    sample_size = int(sys.argv[1])
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    ## originally sample size = 100000
    train_sample = train[:sample_size]

    ## Train the booking model
    for i in range(0,2):
        if i==0:
            model_name = "Booking"
            response_name = "booking_bool"
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            isBook = False
        print("Training the "+model_name+" Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using "+str(len(feature_names))+" features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # print the time interval
        print("Time used,")
        print datetime.now() - tstart
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print datetime.now() - tstart
Example #12
def Logistic_Regression_Classifier(features, target):
    print("===== LogisticRegression =====")
    print("[INFO] Training the Classifier")
    classifier = LogisticRegression(penalty='l1', dual=False, tol=0.000001, C=0.1, fit_intercept=True, intercept_scaling=1, class_weight=None, random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #13
def Gradient_Boosting_Classifier(features, target):
    print("===== GradientBoosting =====")
    print("[INFO] Training the Classifier")
    classifier = GradientBoostingClassifier(learning_rate=0.1,n_estimators=50,max_depth=5,verbose=2,min_samples_split=10,max_features=9,random_state=1)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #14
def main():
    print("Getting features for deleted papers from the database")
    if(os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if(os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [[0] for x in range(len(features_deleted))] + [[1] for x in range(len(features_conf))]
    
    featuresInts = []
    for tup in features:
        a, b, c, d, e = tup
        featuresInts.append((int(a), int(b), int(c), int(d), int(e)))

   
    trainSet = zip(featuresInts, target)
    

   
    N = 5          #N : number of inputs/neurons for input layer
    H1 = 100       #H : number of neurons in hidden layer-1
    #H2 = 5
    M = 1           #number of outputs/neurons of the output layer
    
    learningRate = 0.1
    epochs =  1000
    
    #define layers of MLP keeping in mind that output of one layer is the number of inputs for the next layer
    layer0 = Layer(nNeurons=N, nInpsPerNeuron=-1, transferF='identity', ilayer=0, seed=13)           #input layer
    layer1 = Layer(nNeurons=H1, nInpsPerNeuron=N, transferF='tanh', ilayer=1, seed=13)                #hidden layer 1
    layer2 = Layer(nNeurons=M, nInpsPerNeuron=H1, transferF='tanh', ilayer=2, seed=13)                #output layer 
    #layer3 = Layer(nNeurons=M, nInpsPerNeuron=H2, transferF='logistic', ilayer=3)            #output layer
    
    layers = [layer0, layer1, layer2 ]
    
    mlp = Mlp(layers)
    mlp.showMlp()
    print "\n\nTraining  Mlp for", epochs," Epochs.... please wait... "   
    trainedMlp, iterations = mlp.trainMlp(trainSet, learningRate, epochs)
    print "\n\nFinished training of Mlp "
    trainedMlp.showMlp()
    
    print("Saving the classifier")
    data_io.save_model(mlp,prefix="mlp_")
Example #15
def Gaussian_Process_Regression(features, target):
    print("===== GaussianProcess =====")
    print("[INFO] Training the Classifier")
    
    classifier = GaussianProcess(theta0=0.1, thetaL=0.001, thetaU=1.0)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier)
Example #16
def Random_Forest_Classifier(features, target):
    print("===== RandomForest =====")
    print("[INFO] Training the Classifier")
    max_f = min(9, len(features[0]))
    # TODO(nkhadke, senwu): Figure out multiprocessing error
    classifier = RandomForestClassifier(n_estimators=1000, verbose=2, n_jobs=1, max_depth=10, min_samples_split=10, max_features=max_f, random_state=1, criterion='gini', compute_importances=True)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier)
Example #17
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    classifier = get_pipeline(train)
    classifier.fit(train, train["SalaryNormalized"])

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #18
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    for key in train:
        classifier = get_pipeline(train[key])
        classifier.fit(train[key], train[key]["SalaryNormalized"])
        print("Saving the classifier for %s" %key)
        data_io.save_model(classifier,key)
Example #19
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train[[x for x in train.columns if x != 'label']], train['label'])

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #20
def main():
    print("Reading in the training data")
    train, train_labels = data_io.get_train()

    print("Extracting features and training model")
    classifier = RandomForestClassifier(n_estimators = 500,
                                        min_samples_leaf = 1)
    classifier.fit(train, train_labels)

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #21
def main():
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    target = data_io.read_train_target()

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #22
def main():
    print("Reading in the training data")
    train = data_io.get_train_df()

    mean = train["SalaryNormalized"].mean()
    print("The mean salary is %f" % mean)

    print("Saving the model")
    data_io.save_model(mean)

    predictions = [mean] * len(train)
    print(metrics.MAE(predictions, train["SalaryNormalized"].tolist()))
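metrics.MAE above is a project helper; assuming it computes an ordinary mean absolute error, the same constant-mean baseline can be checked with scikit-learn:

# Constant-mean baseline evaluated with scikit-learn's MAE on made-up salaries,
# assuming metrics.MAE above is an ordinary mean absolute error.
import numpy as np
from sklearn.metrics import mean_absolute_error

salaries = np.array([20000, 35000, 50000, 42000, 28000], dtype=float)
mean = salaries.mean()
predictions = np.full_like(salaries, mean)
print(mean_absolute_error(salaries, predictions))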
Example #23
def main():
    
    set1 = 'train' if len(sys.argv) < 2 else sys.argv[1]
    set2 = [] if len(sys.argv) < 3 else sys.argv[2:]
    train_filter = None
    train_filter2 = None
    
    model = MODEL(**MODEL_PARAMS)
    
    print("Reading in training data " + set1)
    train = data_io.read_data(set1)
    print("Extracting features")
    train = model.extract(train)
    print("Saving train features")
    data_io.write_data(set1, train)
    target = data_io.read_target(set1)
    
    train2 = None
    target2 = None
    for s in set2:
        print "Reading in training data", s
        tr = data_io.read_data(s)
        print "Extracting features"
        tr = model.extract(tr)
        print "Saving train features"
        data_io.write_data(s, tr)
        tg = data_io.read_target(s)
        train2 = tr if train2 is None else pd.concat((train2, tr), ignore_index=True)
        target2 = tg if target2 is None else pd.concat((target2, tg), ignore_index=True)
        train2, target2 = util.random_permutation(train2, target2)
        train_filter2  = ((train2['A type'] != 'Numerical') & (train2['B type'] == 'Numerical'))
        #train_filter2 |= ((train2['A type'] == 'Numerical') & (train2['B type'] != 'Numerical'))

    # Data selection
    train, target = util.random_permutation(train, target)
    train_filter  = ((train['A type'] != 'Numerical') & (train['B type'] == 'Numerical')) 
    #train_filter |= ((train['A type'] == 'Numerical') & (train['B type'] != 'Numerical'))

    if train_filter is not None:
        train = train[train_filter]
        target = target[train_filter]
    if train_filter2 is not None:
        train2 = train2[train_filter2]
        target2 = target2[train_filter2]

    print("Training model with optimal weights")
    X = pd.concat([train, train2]) if train2 is not None else train
    y = np.concatenate((target.Target.values, target2.Target.values)) if target2 is not None else target.Target.values  
    model.fit(X, y) 
    model_path = "cnmodel.pkl"
    print "Saving model", model_path
    data_io.save_model(model, model_path)
def main():

    #sample_size = int(sys.argv[1])
    ## sample_size = int(1000)

    # read train.csv
    train = pd.read_csv(data_io.data_path + "train_set.csv", index_col=False, header=None)
    train.columns = ['year', 'month', 'trade_no', 'sigungu_no', 'price', 'expense', 'count']
    print "Data Size:", (train.shape)

    # feature engineering
    #feature_eng(train)

    ## originally sample size = 100000
    train = train[:300000]

    #
    # book_trainset = train_set[train_set['booking_bool']==1]
    # book_rows = book_trainset.index.tolist()
    # bsize = len(book_trainset.index)
    # click_trainset = train_set[train_set['click_bool']==1]
    # click_rows = click_trainset.index.tolist()
    # csize = len(click_trainset.index)
    # print 'bsize ' + str(bsize)
    # print 'csize ' + str(csize)
    # book_trainset = book_trainset.append(train_set.ix[random.sample(train_set.drop(book_rows).index, bsize)])
    # click_trainset =click_trainset.append(train_set.ix[random.sample(train_set.drop(click_rows).index, csize)])

    #book_trainset = train_set.ix[random.sample(train_set.drop(book_rows).index, bsize)]


    model_name = "predict_model"
    response_name = 'count'
    feature_names = ['year', 'month', 'trade_no', 'sigungu_no', 'price', 'expense'] #get_features()

    print "Training the " + model_name + " Classifier..."
    print "Using " + str(len(feature_names)) + " features..."
    tstart = datetime.now()

    features = train[feature_names].values
    target = train[response_name].values

    classifier = model.model()
    classifier.fit(features, target)

    print "Time used,", datetime.now() - tstart

    print "Saving the classifier..."
    tstart = datetime.now()
    data_io.save_model(classifier, model_name)
    print "Time used,", datetime.now() - tstart
def main():
    features_targets = [entry for entry in csv.reader(open(settings["GBT_TRAIN_FILE_FOR_SUBMIT"]))]

    features = [map(float, entry[:-1]) for entry in features_targets]
    targets = [map(int, entry[-1]) for entry in features_targets]
    features_targets = []

    classifier = RandomForestClassifier(n_estimators=200,
                                        verbose=2,
                                        n_jobs=4,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, targets)
    data_io.save_model(classifier)
Example #26
def main():
    markdown = PagedownToHtml()

    print("Reading in the training data")
    train = data_io.get_train_df()
    for i in train.index:
        train["BodyMarkdown"][i] = markdown.convert(train["BodyMarkdown"][i])

    print("Extracting features and training")
    classifier = get_pipeline()
    classifier.fit(train, train["OpenStatus"])

    print("Saving the classifier")
    data_io.save_model(classifier, "model.pickle")
    model = data_io.load_model("model.pickle")
def main():
    print("Reading in the training data")
    train = data_io.read_train()

    print("Reading in the meta data")
    paper_author, paper_author_indexed = f.get_paper_author()

    print("Computing Relational Information")
    computed_features = f.get_all_computed_features(paper_author)

    print("Extracting features")
    features = []
    target = []
    for author_id, row in train.iterrows():
        for paper_id in row["DeletedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                target.append(1)
                features.append(s)
        for paper_id in row["ConfirmedPaperIds"]:
            s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features)
            if s is None:
                print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id))
            else:
                target.append(0)
                features.append(s)

    print("Target Length: %d" % len(target))
    print("Feature Length: %d" % len(features))

    feature_matrix = pd.DataFrame(features)

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    try:
        classifier.fit(feature_matrix, target)
    except:
        import pdb;pdb.set_trace()

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #28
def main():
    sample_size = int(sys.argv[1])
    ## sample_size = int(1000)
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)
    ## originally sample size = 100000
    train_set = train[:sample_size]
    book_trainset = train_set[train_set['booking_bool']==1]
    book_rows = book_trainset.index.tolist()
    bsize = len(book_trainset.index)
    click_trainset = train_set[train_set['click_bool']==1]
    click_rows = click_trainset.index.tolist()
    csize = len(click_trainset.index)
    print 'bsize ' + str(bsize)
    print 'csize ' + str(csize)
    book_trainset = book_trainset.append(train_set.ix[random.sample(train_set.drop(book_rows).index, bsize)])
    click_trainset =click_trainset.append(train_set.ix[random.sample(train_set.drop(click_rows).index, csize)])
    ## Train the booking model
    for i in range(0,2):
        if i==0:
            model_name = "Booking"
            response_name = "booking_bool"
            train_sample = book_trainset
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            train_sample = click_trainset
            isBook = False
        print("Training the "+model_name+" Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using "+str(len(feature_names))+" features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)
        # print the time interval
        print("Time used,")
        print datetime.now() - tstart
        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print datetime.now() - tstart
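The example above balances the training set by appending an equal-sized random draw of non-booking (and non-click) rows, using the long-removed DataFrame.ix indexer and DataFrame.append. A minimal sketch of the same downsampling idea with current pandas APIs and a made-up frame:

# Balanced-sampling sketch with current pandas (no .ix / .append);
# the small frame and booking_bool column are made up for illustration.
import pandas as pd

train_set = pd.DataFrame({
    "booking_bool": [1, 0, 0, 0, 1, 0, 0, 0],
    "price_usd": [120, 80, 95, 60, 150, 70, 85, 110],
})
positives = train_set[train_set["booking_bool"] == 1]
negatives = train_set[train_set["booking_bool"] == 0].sample(
    n=len(positives), random_state=1)
book_trainset = pd.concat([positives, negatives])
print(book_trainset)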
def main():
    print("Reading training data")
    train_chunks = data_io.read_train_features()
    train = pandas.concat([chunk for chunk in train_chunks], ignore_index=True)
    
    print("Training five Classifier")
    fiveClassifier = classify(train, "five")

    print("Training one Classifier")
    oneClassifier = classify(train, "one")

    print("Training zero Classifier")
    zeroClassifier = classify(train, "zero")

    classifier = (fiveClassifier, oneClassifier, zeroClassifier)
    
    print("Saving the classifiers")
    data_io.save_model(classifier)
def Random_Forest_Classifier(features, target):
    print("===== RandomForest =====")
    print("[INFO] Training the Classifier")
    max_f = min(9, len(features[0]))
    # TODO(nkhadke, senwu): Figure out multiprocessing error
    classifier = RandomForestClassifier(n_estimators=1000,
                                        verbose=2,
                                        n_jobs=1,
                                        max_depth=10,
                                        min_samples_split=10,
                                        max_features=max_f,
                                        random_state=1,
                                        criterion='gini',
                                        compute_importances=True)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #31
def main():
    comm = MPI.COMM_WORLD
    size = comm.Get_size()   
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    feature_name = open("feature_list.txt").read().split()
    # if size < len(feature_name):	# to be done!
    for table_name in ["TrainDeleted", "TrainConfirmed"]:
        if rank > 0:
            # getting features by parallel computing
            print "getting features at node " + str(rank)
            feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(conn, table_name)

        # sending features to rank 0
        print "sending features to node " + str(rank)
        features = comm.gather(feature, root = 0)
        #print features
        if rank == 0:
            temp = []
            for f in features:
                temp.extend(f)
            print "Successfully got the features from " + table_name
            if table_name == "TrainDeleted":
                features_deleted = map(list, np.array(temp).T)
            else:
                features_conf = map(list, np.array(temp).T)

    if rank == 0:
        features = [x[2:] for x in features_deleted + features_conf]
        target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]
        print("Training the Classifier")
        classifier = RandomForestClassifier(n_estimators=50,
                                            verbose=2,
                                            n_jobs=1,
                                            min_samples_split=10,
                                            random_state=1)
        classifier.fit(features, target)

        print("Saving the classifier")
        data_io.save_model(classifier)
        print "Training completed, exit..."
        comm.Abort()
Example #32
 def runWithoutWndchrm(self):
     print "Reading in the training data"
     imageCollections = data_io.get_train_df()
     print "Getting features"
     featureGetter = FeatureGetter()
     fileName = data_io.get_savez_name()
     if not self.load: #Last features calculated from candidates
         (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
     else:
         (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
     print "Getting target vector"
     (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
     print "Training the model"
     classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, compute_importances=True)
     #classifier = KNeighborsClassifier(n_neighbors=50)
     model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
     model.fit(obs[indexes], target[indexes])
     print "Saving the classifier"
     data_io.save_model(model)
Example #33
 def run(self):
     print "Preparing the environment"
     self.prepareEnvironment()
     print "Reading in the training data"
     imageCollections = data_io.get_train_df()
     wndchrmWorker = WndchrmWorkerTrain()
     print "Getting features"
     if not self.loadWndchrm:  #Last wndchrm set of features
         featureGetter = FeatureGetter()
         fileName = data_io.get_savez_name()
         if not self.load:  #Last features calculated from candidates
             (namesObservations, coordinates,
              train) = Utils.calculateFeatures(fileName, featureGetter,
                                               imageCollections)
         else:
             (namesObservations, coordinates,
              train) = Utils.loadFeatures(fileName)
         print "Getting target vector"
         (indexes, target,
          obs) = featureGetter.getTargetVector(coordinates,
                                               namesObservations, train)
         print "Saving images"
         imageSaver = ImageSaver(coordinates[indexes],
                                 namesObservations[indexes],
                                 imageCollections, featureGetter.patchSize,
                                 target[indexes])
         imageSaver.saveImages()
         print "Executing wndchrm algorithm and extracting features"
         (train, target) = wndchrmWorker.executeWndchrm()
     else:
         (train, target) = wndchrmWorker.loadWndchrmFeatures()
     print "Training the model"
     model = RandomForestClassifier(n_estimators=500,
                                    verbose=2,
                                    n_jobs=1,
                                    min_samples_split=30,
                                    random_state=1,
                                    compute_importances=True)
     model.fit(train, target)
     print model.feature_importances_
     print "Saving the classifier"
     data_io.save_model(model)
Example #34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        type=int,
                        action='store',
                        dest='target',
                        help='for validation or test dataset')

    if len(sys.argv) != 3:
        print 'Command e.g.: python train.py -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        features_targets = [
            entry for entry in csv.reader(open(settings["LR_TRAIN_FILE"]))
        ]
    elif para.target == 1:
        features_targets = [
            entry
            for entry in csv.reader(open(settings["LR_TRAIN_FILE_FOR_SUBMIT"]))
        ]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)
    features = [map(float, entry[2:-1]) for entry in features_targets]
    targets = [map(int, entry[-1]) for entry in features_targets]
    '''classifier = GradientBoostingClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)'''
    classifier = LogisticRegression(penalty='l2',
                                    dual=False,
                                    tol=0.0001,
                                    C=1.0,
                                    fit_intercept=True,
                                    intercept_scaling=1,
                                    class_weight=None,
                                    random_state=None)
    classifier.fit(features, targets)
    data_io.save_model(classifier, settings["LR_MODEL_FILE"])
Example #35
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()
    print train

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)
    
    features = [x[0] for x in classifier.steps[0][1].features ]

    csv_fea = csv.writer(open('features.csv','wb'))
    imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    for fea in imp:
        print fea[0], fea[1]
        csv_fea.writerow([fea[0],fea[1]])

    
    oob_score =  classifier.steps[1][1].oob_score_
    print "oob score:", oob_score
    logger = open("run_log.txt","a")
    if len(oob_score) == 1: logger.write("\n" +str( oob_score) + "\n")
    else:logger.write("\n" + str(oob_score[0]) + "\n")

    print("Saving the classifier")
    data_io.save_model(classifier)
   
    print("Predicting the train set")
    train_predict = classifier.predict(train)
    train_predict = train_predict.flatten()
    data_io.write_submission(train_predict, 'train_set', run = 'train')

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff/60,1)
def main():    
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=50, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier)
Example #37
def main():

    print("Reading in the raw data of features and salaries for merging")
    train_f = data_io.get_train_f_df()
    train_s = data_io.get_train_s_df()
    #train_f: training feature data; train_s: training salary data with 0 items deleted 

    """
    train_f.describe
    train_s.describe
    """
    #merge the data by jobId, similar to SQL join
    data = pd.merge(train_f,train_s,how='left')
    data.to_csv("D:/job/indeed_data_science_exercise/RFC1/train9merge2.csv", sep=',',encoding='utf-8')

    # separate the data into the feature columns and the target column (salary)
    #'companyId' excluded
    characters = ["jobType", "degree", "major", "industry", "yearsExperience", "milesFromMetropolis"]
    x = data[characters]
    y = data[['salary']]
         
    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(x, y)

    print("Saving the classifier")
    data_io.save_model(classifier)
    
    print("Load testing data") 
    testin = data_io.get_test_df()
    test=testin[characters]

    print("Making predictions") 
    predictions = classifier.predict(test)   
    predictions = predictions.reshape(len(predictions), 1)

    #classifier.get_params
    #pred_score=explained_variance_score(ycv, predictions, multioutput='raw_values')

    print("Writing predictions to file")
    write_submission(predictions)
Example #38
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t',
                        type=int,
                        action='store',
                        dest='target',
                        help='for validation or test dataset')

    if len(sys.argv) != 3:
        print 'Command e.g.: python train.py -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        features_targets = [
            entry for entry in csv.reader(open(settings["MTLR_TRAIN_FILE"]))
        ]
    elif para.target == 1:
        features_targets = [
            entry for entry in csv.reader(
                open(settings["MTLR_TRAIN_FILE_FOR_SUBMIT"]))
        ]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)
    features = [map(float, entry[2:-1]) for entry in features_targets]
    pairs = [map(int, entry[:2]) for entry in features_targets]
    targets = [map(int, entry[-1]) for entry in features_targets]

    classifier = MeanRegularizedMultiTaskLR(C=0.1,
                                            tol=0.0001,
                                            intercept_scaling=1,
                                            lr=0.02,
                                            eta=0.1,
                                            field_for_model_num=2,
                                            max_niters=200,
                                            confidence=20,
                                            para_init="gaussian")
    classifier.fit(pairs, features, targets)
    data_io.save_model(classifier, settings["MTLR_MODEL_FILE"])
Example #39
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.ix[processed_train_samples_index_lst]
    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2, min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2, learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    sgd_classifier = SGDClassifier(loss="modifier_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
Example #40
def main():
    print("Getting features for deleted papers from the database")
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1,
                                        max_features=None)
    classifier.fit(features, target)

    print("Saving the classifier")
    data_io.save_model(classifier, prefix="forest_")
Example #41
def main():
    print("Getting features for each bird-class ")
    if(os.path.exists("features.obj")):
        with open("features.obj", 'r') as loadfile:
            features = cPickle.load(loadfile)
    else:
        features = data_io.get_features_csv()
        with open("features.obj", 'w') as dumpfile:
            cPickle.dump(features, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    target = []

    print("Training the Classifier")
    classifier = RandomForestClassifier(n_estimators=100, 
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1,
                                        max_features=None)
    classifier.fit(features, target)
    
    print("Saving the classifier")
    data_io.save_model(classifier, prefix="forest_")
Example #42
 def run(self):
     features = f.features
     train = self.getTrainingDataset()
     print "Reading preprocessed features"
     if f.preprocessedFeatures != []:
         intermediate = data_io.read_intermediate_train()
         for i in f.preprocessedFeatures:
             train[i] = intermediate[i]
         for i in features:
             if i[0] in f.preprocessedFeatures:
                 i[1] = i[0]
                 i[2] = f.SimpleTransform(transformer=f.ff.identity)
     print "Reading targets"
     target = data_io.read_train_target()
     print "Extracting features and training model"
     classifier = self.getPipeline(features)
     if self.directionForward:
         finalTarget = [x * (x + 1) / 2 for x in target.Target]
     else:
         finalTarget = [-x * (x - 1) / 2 for x in target.Target]
     classifier.fit(train, finalTarget)
     print classifier.steps[-1][1].feature_importances_
     print "Saving the classifier"
     data_io.save_model(classifier)
Example #43
        base_clf = ExtraTreesRegressor(n_estimators=10,
                                        verbose=2,
                                        n_jobs=1,
                                        oob_score=False,
                                        min_samples_split=2,
                                        random_state=3465343)
        classifier = AdaBoostRegressor(base_estimator=base_clf,
                                       n_estimators=n_trees,
                                       random_state=3465343)

        classifier.fit(features, salaries)
        predictions = classifier.predict(validation_features)
        print valid_salaries[1:10]
        print np.exp(predictions[1:10])
        mae = mean_absolute_error(valid_salaries, np.exp(predictions))
        print "MAE validation: ", mae
        save_model(classifier, name, mae)
        #joblib.dump(predictions, path_join(prediction_dir, name + "_prediction_valid"))
        #oob_predictions = classifier.oob_prediction_
        #mae_oob = mean_absolute_error(salaries, oob_predictions)
        #print "MAE OOB: ", mae_oob
        base_clf = ExtraTreesRegressor(n_estimators=10,
                                        verbose=2,
                                        n_jobs=1,
                                        oob_score=False,
                                        min_samples_split=2,
                                        random_state=3465343)
        classifier = AdaBoostRegressor(base_estimator=base_clf,
                                       n_estimators=n_trees,
                                       random_state=3465343)
        scores = cross_val_score(classifier, features, salaries, cv=3, score_func=log_mean_absolute_error, verbose=1)
        print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
Example #44
 def write_output(self, model, file):
     data_io.save_model(model, file)
Example #45
def main():
    '''
    print("Getting features for deleted papers from the database")
    features_deleted = data_io.get_features_db("TrainDeleted")

    print("Getting features for confirmed papers from the database")
    features_conf = data_io.get_features_db("TrainConfirmed")
    '''

    features_deleted = pickle.load(
        open(data_io.get_paths()["deleted_features"], 'rb'))
    features_conf = pickle.load(
        open(data_io.get_paths()["confirmed_features"], 'rb'))

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0 for x in range(len(features_deleted))
              ] + [1 for x in range(len(features_conf))]

    print("Training the Classifier")
    features = np.array(features)
    target = np.array(target)
    '''
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1)
    classifier.fit(features, target)
    '''

    #Referred https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/ for parameter tuning

    param_test1 = {'max_depth': [19], 'min_child_weight': [1]}

    param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}

    param_test3 = {
        'subsample': [i / 10.0 for i in range(6, 10)],
        'colsample_bytree': [i / 10.0 for i in range(6, 10)]
    }
    '''
    gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=19,
                                                    min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
                                                    objective='binary:logistic', scale_pos_weight=1,
                                                    seed=27), param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
    gsearch1.fit(features, target)
    print(gsearch1.grid_scores_)
    print(gsearch1.best_params_)
    print(gsearch1.best_score_)
    exit()
    '''
    '''
    classifier = xgb.XGBClassifier(learning_rate=0.03, n_estimators=300, max_depth=19,
                                                    min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9,
                                                    objective='binary:logistic', seed=27).fit(features, target)
    '''
    '''
    classifier = RandomForestClassifier(n_estimators=50,
                                        verbose=2,
                                        n_jobs=1,
                                        min_samples_split=10,
                                        random_state=1).fit(features, target)
    '''
    '''
    print(len(features))
    a = np.random.permutation(len(features))[0:10000]
    features = features[a]
    target = target[a]
    classifier = svm.SVC(probability=True).fit(features, target)
    '''

    #classifier = GaussianNB().fit(features, target)

    classifier = xgb.XGBClassifier(max_depth=5,
                                   n_estimators=300,
                                   learning_rate=0.05,
                                   objective="binary:logistic").fit(
                                       features, target)

    print("Saving the classifier")
    data_io.save_model(classifier)

    # accuracy 0.9729 for valid set
    #classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit(features, target)
    ''' accuracy 0.9723 for valid set
Example #46
if __name__ == "__main__":
    print("Reading in the training data")
    train_raw = data_io.read_train_pairs()
    target = data_io.read_train_target()
    info = data_io.read_train_info()
    info['iindex'] = range(4050)

    train = train_raw.join(info)

    classifier = get_pipeline()

    ### FOLDS CODE
    #    folds = cval.KFold(len(train), n_folds=2, indices=False)
    #
    #    results = []
    #    for i, fold in enumerate(folds):
    #        print("Extracting features and training model for fold " + str(i))
    #        traincv, testcv = fold
    #        classifier.fit(train[traincv], target[traincv])
    #        results.append(classifier.score(train[testcv], target[testcv]))
    #
    #    print(results)
    #    print('Score: ' + str(np.array(results).mean()))
    ###  REGULAR RUN

    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier)
Example #47
        #print "Feature ranking:"

        #for f in xrange(len(indices)):
        #print "%d. feature %d %s (%f)" % (f + 1, indices[f], names[indices[f]], importances[indices[f]])

        ## Plot the feature importances of the forest
        #import pylab as pl
        #pl.figure()
        #pl.title("Feature importances")
        #pl.bar(xrange(len(indices)), importances[indices],
        #color="r", align="center")
        #pl.xticks(xrange(len(indices)), indices)
        #pl.xlim([-1, len(indices)])
        #pl.show()
        #a=5/0
        save_model(classifier, name, mae)
        #joblib.dump(predictions, path_join(prediction_dir, name + "_prediction_valid"))
        #kmeans = KMeans(
        #random_state=3465343,
        #n_clusters=n_clusters,
        #n_jobs=-1,
        #verbose=0)
        #classifier = Pipeline(steps=[('knn', kmeans), ('tree', DecisionTreeRegressor())])
        #classifier = DecisionTreeRegressor(max_depth=3, random_state=3465343)
        #classifier = ExtraTreesRegressor(n_estimators=n_trees,
        #verbose=1,
        #n_jobs=-1,
        #oob_score=False,
        #min_samples_split=min_samples_split,
        #random_state=3465343)
        clf = SGDRegressor(random_state=3465343, verbose=0, n_iter=n_trees)
Example #48
 def write_output(self, model, file):
     #save the output
     data_io.save_model(model, file)
def main():
    t1 = time()
    print("Reading in the training data")
    train = data_io.read_train_pairs()
    train_info = data_io.read_train_info()
    train = combine_types(train, train_info)

    #make function later
    train = get_types(train)
    target = data_io.read_train_target()

    print "Reading SUP data..."
    for i in range(1, 4):
        print "SUP", str(i)
        sup = data_io.read_sup_pairs(i)
        sup_info = data_io.read_sup_info(i)
        sup = combine_types(sup, sup_info)
        sup = get_types(sup)
        sup_target = data_io.read_sup_target(i)
        train_info = train_info.append(sup_info)
        train = train.append(sup)
        target = target.append(sup_target)

    # Old train
    print "Reading old train data..."
    old_train = data_io.read_old_train_pairs()
    old_train_info = data_io.read_old_train_info()
    old_train = combine_types(old_train, old_train_info)
    old_train = get_types(old_train)
    old_target = data_io.read_old_train_target()

    train = train.append(old_train)
    target = target.append(old_target)
    # End old train

    print "Train size = ", str(train.shape)
    print("Extracting features and training model")
    feature_trans = fe.feature_extractor()
    orig_train = feature_trans.fit_transform(train)
    orig_train = numpy.nan_to_num(orig_train)

    classifier = classify_catagory(orig_train, target.Target)
    #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info)

    print("Saving features")
    data_io.save_features(orig_train)

    print("Saving the classifier")
    #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) )
    data_io.save_model(classifier)

    #features = [x[0] for x in classifier.steps[0][1].features ]

    #csv_fea = csv.writer(open('features.csv','wb'))
    #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True)
    #for fea in imp:
    #    print fea[0], fea[1]
    #    csv_fea.writerow([fea[0],fea[1]])

    t2 = time()
    t_diff = t2 - t1
    print "Time Taken (min):", round(t_diff / 60, 1)
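
# A minimal sketch (not part of the original) of the same data stacking done
# with pd.concat, since DataFrame.append has been removed in recent pandas; it
# reuses the reader helpers from the example above and collects the pieces in
# lists before concatenating once.
def read_all_training_data():
    import pandas as pd

    train_parts = [get_types(combine_types(data_io.read_train_pairs(),
                                           data_io.read_train_info()))]
    target_parts = [data_io.read_train_target()]
    for i in range(1, 4):
        sup = get_types(combine_types(data_io.read_sup_pairs(i),
                                      data_io.read_sup_info(i)))
        train_parts.append(sup)
        target_parts.append(data_io.read_sup_target(i))
    # one concat call replaces the repeated DataFrame.append calls
    return pd.concat(train_parts), pd.concat(target_parts)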
Example #50
0
def test_mlp(learning_rate=0.013, L1_reg=0.00, L2_reg=0.0003, n_epochs=300,
                          n_hidden=100):
        """
    
        Train a one-hidden-layer MLP on the bird-class features with minibatch
        SGD, tracking the validation loss with a patience rule.

        :type learning_rate: float
        :param learning_rate: learning rate used (factor for the stochastic
        gradient step)

        :type L1_reg: float
        :param L1_reg: L1-norm's weight when added to the cost (see
        regularization)

        :type L2_reg: float
        :param L2_reg: L2-norm's weight when added to the cost (see
        regularization)

        :type n_epochs: int
        :param n_epochs: maximal number of epochs to run the optimizer

        :type n_hidden: int
        :param n_hidden: number of units in the hidden layer
        """
        np.random.seed(0)  
        print("Getting features for bird classes")
        if(os.path.exists("features.obj")):
            with open("features.obj", 'r') as loadfile:
                features, target = cPickle.load(loadfile)
        else:
            features, target = data_io.get_features_mat()
            with open("features.obj", 'w') as dumpfile:
                cPickle.dump((features, target), dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)
    
            
        datasets = load_data(features, target)#gen_data2()
    
        train_set_x, train_set_y = datasets[0]
        valid_set_x, valid_set_y = datasets[1]
        test_set_x, test_set_y = datasets[2]
        
    
        batch_size = 16    # size of the minibatch

        # compute number of minibatches for training, validation and testing
        n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
        n_test_batches  = test_set_x.get_value(borrow=True).shape[0]  / batch_size
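        # Note: these are Python 2 integer divisions, so any samples that do
        # not fill a complete minibatch are silently dropped from each split.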
    
        ######################
        # BUILD ACTUAL MODEL #
        ######################
        print '... building the model'
    
        # allocate symbolic variables for the data
        
        index = T.lscalar()
        x = T.matrix('x', dtype='float64')  # dense float matrix of features (sparse.csr_matrix('x', dtype='int32') is the sparse alternative)
        y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels
    
        rng = np.random.RandomState(113)
        
        # construct the MLP class
        classifier = MLP(rng=rng, input=x, n_in=features.shape[1],
                         n_hidden=n_hidden, n_out=35)
    
        # the cost we minimize during training is the negative log likelihood of
        # the model plus the regularization terms (L1 and L2); cost is expressed
        # here symbolically
        cost = classifier.negative_log_likelihood(y) \
             + L1_reg * classifier.L1 \
             + L2_reg * classifier.L2_sqr
        
    
        # compiling a Theano function that computes the mistakes that are made
        # by the model on a minibatch
#        test_model = theano.function(inputs=[size],
#                outputs=[classifier.errors(y),classifier.getPredictions()],
#                 givens={
#                    x: test_set_x[0:size],
#                    y: test_set_y[0:size]}
#                )
#    
#        validate_model = theano.function(inputs=[size],
#                outputs=[classifier.errors(y),classifier.getPredictions()],
#                 givens={
#                    x:valid_set_x[0:size],
#                    y:valid_set_y[0:size]}
#                )

        test_model = theano.function(inputs=[index],
                outputs=classifier.errors(y),
                givens={
                    x: test_set_x[index * batch_size: (index + 1) * batch_size],
                    y: test_set_y[index * batch_size: (index + 1) * batch_size]})
    
        validate_model = theano.function(inputs=[index],
                outputs=classifier.errors(y),
                givens={
                    x: valid_set_x[index * batch_size:(index + 1) * batch_size],
                    y: valid_set_y[index * batch_size:(index + 1) * batch_size]})
        
#        predict_model = theano.function(inputs=[],
#                outputs=classifier.predictions(),
#                givens={
#                    x: predict_set_x})
    
        # compute the gradient of cost with respect to theta (stored in params)
        # the resulting gradients will be stored in a list gparams
        gparams = []
        for param in classifier.params:
            gparam = T.grad(cost, param)
            gparams.append(gparam)
    
        # specify how to update the parameters of the model as a dictionary
        updates = {}
        # given two lists of the same length, A = [a1, a2, a3, a4] and
        # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
        # each element is a pair formed from the two lists:
        #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
        for param, gparam in zip(classifier.params, gparams):
            updates[param] = param - learning_rate * gparam
    
        # compiling a Theano function `train_model` that returns the cost, but
        # in the same time updates the parameter of the model based on the rules
        # defined in `updates`
#        train_model = theano.function(inputs=[size],
#                                      outputs=cost,
#                updates=updates,
#                givens={
#                    x: train_set_x[0:size],
#                    y: train_set_y[0:size]}
#               )
        train_model = theano.function(inputs=[index],
            outputs=cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size],
                y: train_set_y[index * batch_size:(index + 1) * batch_size]})
    
        ###############
        # TRAIN MODEL #
        ###############
        print '... training'
        

        # early-stopping parameters
        patience = 1000000  # look at this many examples regardless
        patience_increase = 2  # wait this much longer when a new best is found
        improvement_threshold = 0.0995  # a relative improvement of this much
                                        # is considered significant
        validation_frequency = min(n_train_batches, patience / 2)
                                      # go through this many minibatches before
                                      # checking the network on the validation
                                      # set; in this case we check every epoch
    
        best_params = None
        best_validation_loss = np.inf
        best_iter = 0
        test_score = 0.
        start_time = time.clock()
    
        epoch = 0
        done_looping = False
    
        
        
        
        while (epoch < n_epochs) and (not done_looping):
            #datasets = load_data(featuresMat, targetInts)#permute data()
#    
#            train_set_x, train_set_y = datasets[0]
#            valid_set_x, valid_set_y = datasets[1]
#            test_set_x, test_set_y = datasets[2]
            epoch = epoch + 1
            training_cost = []
            for minibatch_index in xrange(n_train_batches):
                minibatch_avg_cost = train_model(minibatch_index)
                training_cost.append(minibatch_avg_cost)
                # iteration number
                iter = (epoch - 1) * n_train_batches + minibatch_index
    
                if (iter + 1) % validation_frequency == 0:
                    # compute zero-one loss on validation set
                    validation_losses = [validate_model(i) for i
                                         in xrange(n_valid_batches)]
                    this_validation_loss = np.mean(validation_losses)
    
#                    print('epoch %i, minibatch %i/%i, validation error %f %%' %
#                         (epoch, minibatch_index + 1, n_train_batches,
#                          this_validation_loss * 100.))
    
                    # if we got the best validation score until now
                    if this_validation_loss < best_validation_loss:
                        #improve patience if loss improvement is good enough
                        if this_validation_loss < best_validation_loss *  \
                               improvement_threshold:
                            patience = max(patience, iter * patience_increase)
    
                        best_validation_loss = this_validation_loss
                        best_iter = iter
                        best_params = []
                        best_params.append(classifier.params)
    
                        # test it on the test set
                        test_losses = [test_model(i) for i
                                       in xrange(n_test_batches)]
                        test_score = np.mean(test_losses)
    
#                        print(('     epoch %i, minibatch %i/%i, test error of '
#                               'best model %f %%') %
#                              (epoch, minibatch_index + 1, n_train_batches,
#                               test_score * 100.))
#    
                mean_cost = np.mean(training_cost)
                if(mean_cost < 0.0005):
                    done_looping = True
                    print "training cost: ", mean_cost
                    break
            print "Epoch ", epoch," training cost: ", mean_cost
       
    
        end_time = time.clock()
        print(('Optimization complete. Best validation score of %f %% '
               'obtained at iteration %i, with test performance %f %%') %
              (best_validation_loss * 100., best_iter + 1, test_score * 100.))
        print >> sys.stderr, ('The code for file ' +
                              os.path.split(__file__)[1] +
                              ' ran for %.2fm' % ((end_time - start_time) / 60.))
        
        print("Saving the mlp best params")
        data_io.save_model(best_params, prefix="theano_")
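
# A minimal, framework-free sketch (not from the original) of the patience
# bookkeeping used above: a sufficiently large improvement in validation loss
# pushes the stopping horizon further out. (The example itself breaks on a low
# training cost; comparing `iteration` against `patience` is the usual
# companion stop test.)
def update_patience(iteration, patience, best_loss, current_loss,
                    improvement_threshold, patience_increase=2):
    if current_loss < best_loss:
        if current_loss < best_loss * improvement_threshold:
            # a significant improvement extends the patience horizon
            patience = max(patience, iteration * patience_increase)
        best_loss = current_loss
    # stop once the iteration counter has passed the patience horizon
    return iteration >= patience, patience, best_loss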
Example #51
0
def test_mlp(learning_rate=0.013,
             L1_reg=0.0001,
             L2_reg=0.0003,
             n_epochs=10000,
             n_hidden=50,
             n_hidden2=10):
    """
    
    Train a two-hidden-layer MLP on the confirmed/deleted paper features with
    minibatch SGD and early stopping, then rank the valid papers per author.

    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic
    gradient step)

    :type L1_reg: float
    :param L1_reg: L1-norm's weight when added to the cost (see
    regularization)

    :type L2_reg: float
    :param L2_reg: L2-norm's weight when added to the cost (see
    regularization)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type n_hidden: int
    :param n_hidden: number of units in the first hidden layer

    :type n_hidden2: int
    :param n_hidden2: number of units in the second hidden layer
    """
    np.random.seed(17)
    print("Getting features for deleted papers from the database")
    features_deleted = None
    features_conf = None
    if (os.path.exists("features_deleted.obj")):
        with open("features_deleted.obj", 'r') as loadfile:
            features_deleted = cPickle.load(loadfile)
    else:
        features_deleted = data_io.get_features_db("TrainDeleted")
        with open("features_deleted.obj", 'w') as dumpfile:
            cPickle.dump(features_deleted,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for confirmed papers from the database")
    if (os.path.exists("features_confirmed.obj")):
        with open("features_confirmed.obj", 'r') as loadfile:
            features_conf = cPickle.load(loadfile)
    else:
        features_conf = data_io.get_features_db("TrainConfirmed")
        with open("features_confirmed.obj", 'w') as dumpfile:
            cPickle.dump(features_conf,
                         dumpfile,
                         protocol=cPickle.HIGHEST_PROTOCOL)

    print("Getting features for valid papers from the database")
    if (os.path.exists("features_valid.obj")):
        with open("features_valid.obj", 'r') as loadfile:
            data = cPickle.load(loadfile)
    else:
        data = data_io.get_features_db("ValidPaper")
        with open("features_valid.obj", 'w') as dumpfile:
            cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL)

    author_paper_ids = [x[:2] for x in data]
    features_valid = [x[2:] for x in data]

    features_validnp = np.array(features_valid, dtype='float64')

    #        predictInts = []
    #        for tup in features_valid:
    #           a, b, c, d, e = tup
    #           predictInts.append((int(a), int(b), int(c), int(d), int(e)))
    #
    #        predictsMat = np.ndarray(shape=(len(predictInts), 5), dtype='int32')
    #        for i, tup in enumerate(predictInts):
    #            a, b, c, d, e = tup
    #            predictsMat[i, 0] = a;  predictsMat[i, 1] = b; predictsMat[i, 2] = c; predictsMat[i, 3] = d; predictsMat[i, 4] = e;
    predict_set_x = theano.shared(features_validnp, borrow=True)

    features = [x[2:] for x in features_deleted + features_conf]
    target = [0] * len(features_deleted) + [1] * len(features_conf)

    featuresnp = np.array(features, dtype='float64')
    targetnp = np.array(target, dtype='int32')

    featuresnp -= np.mean(featuresnp, axis=0)
    featuresnp /= np.std(featuresnp, axis=0)
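    # Note: a constant feature column has zero standard deviation and would
    # turn into NaN/inf here; the standardization assumes every column varies.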

    cv = cross_validation.ShuffleSplit(len(features),
                                       n_iter=1,
                                       test_size=0.25,
                                       random_state=0)
    for train, test in cv:
        train_set_x = theano.shared(featuresnp[train], borrow=True)
        test_set_x = theano.shared(featuresnp[test], borrow=True)
        train_set_y = theano.shared(targetnp[train], borrow=True)
        test_set_y = theano.shared(targetnp[test], borrow=True)

    batch_size = 20  # size of the minibatch

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    #        n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    # allocate symbolic variables for the data

    #        size = T.lscalar()
    index = T.lscalar()
    x = T.matrix('x', dtype='float64')  # dense float matrix of features (sparse.csr_matrix('x', dtype='int32') is the sparse alternative)
    y = T.ivector('y')  # the labels are presented as a 1D vector of [int] labels

    rng = np.random.RandomState(113)

    # construct the MLP class
    classifier = MLP(rng=rng,
                     input=x,
                     n_in=featuresnp.shape[1],
                     n_hidden=n_hidden,
                     n_out=2,
                     n_hidden2=10)

    cost = classifier.negative_log_likelihood(y) \
         + L1_reg * classifier.L1 \
         + L2_reg * classifier.L2_sqr

    test_model = theano.function(
        inputs=[index],
        outputs=classifier.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size]
        })

    predict_model = theano.function(inputs=[],
                                    outputs=classifier.predictions(),
                                    givens={x: predict_set_x})

    # compute the gradient of cost with respect to theta (stored in params)
    # the resulting gradients will be stored in a list gparams
    gparams = []
    for param in classifier.params:
        gparam = T.grad(cost, param)
        gparams.append(gparam)

    # specify how to update the parameters of the model as a dictionary
    updates = OrderedDict()
    # given two lists of the same length, A = [a1, a2, a3, a4] and
    # B = [b1, b2, b3, b4], zip generates a list C of the same size, where
    # each element is a pair formed from the two lists:
    #    C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)]
    for param, gparam in zip(classifier.params, gparams):
        updates[param] = param - learning_rate * gparam

    train_model = theano.function(
        inputs=[index],
        outputs=cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size]
        })

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'

    # early-stopping parameters
    patience = 1000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.0995  # a relative improvement of this much is
                                    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; in this case we check every epoch

    best_params = None
    best_validation_loss = np.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()

    epoch = 0
    done_looping = False

    best_params = None
    while (epoch < n_epochs) and (not done_looping):

        epoch = epoch + 1
        training_cost = []
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            training_cost.append(minibatch_avg_cost)
            # iteration number
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [
                    test_model(i) for i in xrange(n_test_batches)
                ]
                this_validation_loss = np.mean(validation_losses)

                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    #improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss *  \
                           improvement_threshold:
                        patience = max(patience, iter * patience_increase)

                    best_validation_loss = this_validation_loss
                    best_iter = iter
                    best_params = classifier.params

            if (best_validation_loss < 0.005):
                done_looping = True
                print "Best Validation cost: ", best_validation_loss
                break

        mean_cost = np.mean(training_cost)
        print "Epoch ", epoch, " training cost: ", mean_cost

    end_time = time.clock()
    print(('Optimization complete. Best validation score of %f %% '
           'obtained at iteration %i, with test performance %f %%') %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))

    print("Saving the mlp best params")
    data_io.save_model(best_params, prefix="theano_")

    ############################
    #Making Predictions
    ############################

    print("Making predictions")
    predictions = predict_model()  # classifier.predict_proba(features_valid)[:, 1]
    predictions = list(predictions)

    author_predictions = defaultdict(list)
    paper_predictions = {}

    for (a_id, p_id), pred in zip(author_paper_ids, predictions):
        author_predictions[a_id].append((pred, p_id))

    for author_id in sorted(author_predictions):
        paper_ids_sorted = sorted(author_predictions[author_id], reverse=True)
        paper_predictions[author_id] = [x[1] for x in paper_ids_sorted]

    print("Writing predictions to file")
    data_io.write_submission(paper_predictions, prefix="theano_")
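
# A minimal sketch (not from the original) of the load-or-compute pattern this
# example repeats for features_deleted.obj, features_confirmed.obj and
# features_valid.obj: pickle the result of an expensive call the first time and
# reuse the file afterwards. Binary file modes are used so the helper also
# works with Python 3's pickle.
def cached(path, compute):
    import os
    import pickle

    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    value = compute()
    with open(path, 'wb') as f:
        pickle.dump(value, f, protocol=pickle.HIGHEST_PROTOCOL)
    return value

# e.g. features_deleted = cached("features_deleted.obj",
#                                lambda: data_io.get_features_db("TrainDeleted"))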