def main(): print("Reading training data") train = data_io.read_train() train.fillna(-1, inplace=True) #train_sample = train.fillna(value=-2) #train_sample = train[:2500000].fillna(value=0) train_sample = train[:100000] #train_sample = train.fillna(value=0) feature_names = list(train_sample.columns) feature_names.remove("click_bool") feature_names.remove("booking_bool") feature_names.remove("gross_bookings_usd") feature_names.remove("date_time") feature_names.remove("position") features = train_sample[feature_names].values #train_sample["position"] *= -1.0 #target = train_sample["position"].values #target = train_sample["booking_bool"].values target = train_sample["booking_bool"].values print("Training the Classifier") classifier = LambdaMART(n_estimators=50, verbose=2, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier)
def main():
    X, y = load_train_set()
    X = delete_unused_columns(X)
    #X,y = sample(X,y, 0.1)
    #X,y = selectAllCategorical(X,y)
    #print X.shape
    #exit()
    #
    # import re
    # prog = re.compile(".*_[1,3]")
    # matches = [prog.match(i) is not None for i in X.index]
    # X,y = X[matches],y[matches]

    params = {'n_estimators': 3000, 'subsample': 0.6, 'random_state': 0,
              'verbose': 90, 'min_samples_split': 5,
              'learning_rate': 0.00636406103119062, 'max_depth': 12,
              'min_samples_leaf': 59}
    #params = {'n_estimators': 3000, 'subsample': 0.6, 'random_state': 0, 'verbose': 90, 'min_samples_split': 5, 'learning_rate': 0.1, 'max_depth': 12, 'min_samples_leaf': 59}
    print params

    score, c = cross_val(X, y, clf, params=params, n_folds=2, shuffle=True,
                         score_func=fit_clf, test_size=0.10)

    #bestClf = data_io.load_model(); print "AUC", auc(y, bestClf.predict(X)); exit(0)
    bestClf = clf(**params)
    bestClf.fit(X, y.Target)
    print "AUC", auc(y, bestClf.predict(X))
    #print_importances(X, bestClf, 1)

    print("Saving the classifier")
    data_io.save_model(bestClf)
def main():
    classifier1 = RandomForestClassifier(n_estimators=100, max_features=0.5, max_depth=5.0)
    classifier2 = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=0)
    classifier3 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier4 = SVC(kernel='rbf', C=10.0, random_state=0, gamma=0.10)
    classifier5 = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    classifier6 = GaussianNB()
    classifier7 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['Id'], axis=1, inplace=True)
    #print train.head()
    train = train.values

    #eclf = EnsembleClassifier(clfs=[classifier1, classifier2, classifier3, classifier5, classifier6], voting='hard')
    #eclf = EnsembleClassifier(clfs=[classifier1], voting='hard')
    eclf = classifier3
    #scores = cross_val_score(estimator=eclf, X=train[0:, 0:-1], y=train[0:, -1], cv=10, scoring='roc_auc')
    #print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))
    eclf.fit(train[0:, 0:-1], train[0:, -1])

    # importances = eclf.feature_importances_
    # indices = np.argsort(importances)[::-1]
    # for f in range(train[0:, 0:-1].shape[1]):
    #     print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
    #
    print("Saving the classifier")
    data_io.save_model(eclf)
def run(self): print "Preparing the environment" self.prepareEnvironment() print "Reading in the training data" imageCollections = data_io.get_train_df() wndchrmWorker = WndchrmWorkerTrain() print "Getting features" if not self.loadWndchrm: #Last wndchrm set of features featureGetter = FeatureGetter() fileName = data_io.get_savez_name() if not self.load: #Last features calculated from candidates (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections) else: (namesObservations, coordinates, train) = Utils.loadFeatures(fileName) print "Getting target vector" (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train) print "Saving images" imageSaver = ImageSaver(coordinates[indexes], namesObservations[indexes], imageCollections, featureGetter.patchSize, target[indexes]) imageSaver.saveImages() print "Executing wndchrm algorithm and extracting features" (train, target) = wndchrmWorker.executeWndchrm() else: (train, target) = wndchrmWorker.loadWndchrmFeatures() print "Training the model" model = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1, min_samples_split=30, random_state=1, compute_importances=True) model.fit(train, target) print model.feature_importances_ print "Saving the classifier" data_io.save_model(model)
def main(): print("Getting features for deleted papers from the database") if(os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if(os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, max_features=None) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier, prefix="forest_")
def main(argv):
    n = None
    try:
        opts, args = getopt.getopt(argv, "ht:s:", ["train=", "settings="])
    except getopt.GetoptError:
        print 'test.py -t <train number> -s <settings file>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -t <train number>'
            sys.exit()
        elif opt in ("-t", "--train"):
            n = int(arg)
        elif opt in ("-s", "--settings"):
            settings = arg

    print("Reading in the training data")
    train = data_io.read_train_pairs(settings)
    target = data_io.read_train_target(settings)

    print("Extracting features and training model")
    classifier = get_pipeline()
    classifier.fit(train, target.Target)

    print("Saving the classifier")
    data_io.save_model(classifier, settings)
def main():
    classifier1 = RandomForestClassifier(n_estimators=100, max_features=0.5, max_depth=5.0)
    classifier2 = DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=0)
    classifier3 = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
    classifier4 = SVC(kernel='rbf', C=10.0, random_state=0, gamma=0.10)
    classifier5 = LogisticRegression(penalty='l2', C=1.0, random_state=0)
    classifier6 = GaussianNB()

    print("Reading in the training data")
    train = data_io.get_train_df()

    print("Cleaning data. Check here for imputation, One hot encoding and factorization procedures..")
    train = FeatureConverter().clean_data(train)
    train.drop(['PassengerId'], axis=1, inplace=True)
    #print train.head()
    train = train.values

    eclf = EnsembleClassifier(clfs=[classifier1, classifier2, classifier3, classifier5, classifier6], voting='hard')
    #eclf = EnsembleClassifier(clfs=[classifier2], voting='hard')
    scores = cross_val_score(estimator=eclf, X=train[0:, 1:], y=train[0:, 0], cv=10, scoring='roc_auc')
    print("Accuracy: %0.4f (+/- %0.3f)" % (scores.mean(), scores.std()))
    eclf.fit(train[0:, 1:], train[0:, 0])

    print("Saving the classifier")
    data_io.save_model(eclf)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    parser.add_argument('-c1', type=int, action='store', dest='ucluster_num',
                        help='cluster number of users')
    parser.add_argument('-c2', type=int, action='store', dest='icluster_num',
                        help='cluster number of items')
    if len(sys.argv) != 7:
        print 'Command e.g.: python cluster.py -t 0(1) -c1 20 -c2 50'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        user_features = [entry for entry in csv.reader(open(settings["USER_CLUSTER_TRAIN_FILE"]))]
        item_features = [entry for entry in csv.reader(open(settings["ITEM_CLUSTER_TRAIN_FILE"]))]
    elif para.target == 1:
        user_features = [entry for entry in csv.reader(open(settings["USER_CLUSTER_TRAIN_FILE_FOR_SUBMIT"]))]
        item_features = [entry for entry in csv.reader(open(settings["ITEM_CLUSTER_TRAIN_FILE_FOR_SUBMIT"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)

    user_features = [map(int, entry[1:]) for entry in user_features]
    item_features = [map(int, entry[1:]) for entry in item_features]

    cluster = KMeans(n_clusters=para.ucluster_num)
    cluster.fit(user_features)
    data_io.save_model(cluster, settings["USER_CLUSTER_MODEL_FILE"])

    cluster = KMeans(n_clusters=para.icluster_num)
    cluster.fit(item_features)
    data_io.save_model(cluster, settings["ITEM_CLUSTER_MODEL_FILE"])
def main(): print("Reading training data ...") train = data_io.read_train() train.fillna(0, inplace=True) train_sample = train.fillna(value=0) features = ut.preprocess(train_sample) target = ut.construct_target(train_sample) # target = train_sample["booking_bool"].values # save the processed data, which may be useful # to test the performance of our model print("Saving processed training data ...") data_io.save_processed_data([features, target]) print("Training the Regressor ...") regressor = RandomForestRegressor(n_estimators=10, #RandomForestClassifier verbose=2, n_jobs=-1, max_features = "sqrt", min_samples_split=10, random_state=1) regressor.fit(features, target) print("Saving the Regressor ...") data_io.save_model(regressor)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    if len(sys.argv) != 3:
        print 'Command e.g.: python train.py -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        features_targets = [entry for entry in csv.reader(open(settings["LR_TRAIN_FILE"]))]
    elif para.target == 1:
        features_targets = [entry for entry in csv.reader(open(settings["LR_TRAIN_FILE_FOR_SUBMIT"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)

    features = [map(float, entry[2:-1]) for entry in features_targets]
    # each label is a single integer, not a one-element list
    targets = [int(entry[-1]) for entry in features_targets]

    '''classifier = GradientBoostingClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1)'''
    classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    class_weight=None, random_state=None)
    classifier.fit(features, targets)
    data_io.save_model(classifier, settings["LR_MODEL_FILE"])
def main():
    sample_size = int(sys.argv[1])
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)

    ## originally sample size = 100000
    train_sample = train[:sample_size]

    ## Train the booking model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            isBook = False

        print("Training the "+model_name+" Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using "+str(len(feature_names))+" features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)

        # print the time interval
        print("Time used,")
        print datetime.now() - tstart

        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print datetime.now() - tstart
def Logistic_Regression_Classifier(features, target):
    print("===== LogisticRegression =====")
    print("[INFO] Training the Classifier")
    classifier = LogisticRegression(penalty='l1', dual=False, tol=0.000001, C=0.1,
                                    fit_intercept=True, intercept_scaling=1,
                                    class_weight=None, random_state=1)
    classifier.fit(features, target)
    print("Saving the classifier")
    data_io.save_model(classifier)
def Gradient_Boosting_Classifier(features, target):
    print("===== GradientBoosting =====")
    print("[INFO] Training the Classifier")
    classifier = GradientBoostingClassifier(learning_rate=0.1,
                                            n_estimators=50,
                                            max_depth=5,
                                            verbose=2,
                                            min_samples_split=10,
                                            max_features=9,
                                            random_state=1)
    classifier.fit(features, target)
    print("Saving the classifier")
    data_io.save_model(classifier)
def main(): print("Getting features for deleted papers from the database") if(os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if(os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [[0] for x in range(len(features_deleted))] + [[1] for x in range(len(features_conf))] featuresInts = [] for tup in features: a, b, c, d, e = tup featuresInts.append((int(a), int(b), int(c), int(d), int(e))) trainSet = zip(featuresInts, target) N = 5 #N : number of inputs/neurons for input layer H1 = 100 #H : number of neurons in hidden layer-1 #H2 = 5 M = 1 #number of outputs/neurons of the output layer learningRate = 0.1 epochs = 1000 #define layers of MLP keeping in mind that output of one layer is the number of inputs for the next layer layer0 = Layer(nNeurons=N, nInpsPerNeuron=-1, transferF='identity', ilayer=0, seed=13) #input layer layer1 = Layer(nNeurons=H1, nInpsPerNeuron=N, transferF='tanh', ilayer=1, seed=13) #hidden layer 1 layer2 = Layer(nNeurons=M, nInpsPerNeuron=H1, transferF='tanh', ilayer=2, seed=13) #output layer #layer3 = Layer(nNeurons=M, nInpsPerNeuron=H2, transferF='logistic', ilayer=3) #output layer layers = [layer0, layer1, layer2 ] mlp = Mlp(layers) mlp.showMlp() print "\n\nTraining Mlp for", epochs," Epochs.... please wait... " trainedMlp, iterations = mlp.trainMlp(trainSet, learningRate, epochs) print "\n\nFinished training of Mlp " trainedMlp.showMlp() print("Saving the classifier") data_io.save_model(mlp,prefix="mlp_")
def Gaussian_Process_Regression(features, target):
    print("===== GaussianProcess =====")
    print("[INFO] Training the Classifier")
    classifier = GaussianProcess(theta0=0.1, thetaL=0.001, thetaU=1.0)
    classifier.fit(features, target)
    print("Saving the classifier")
    data_io.save_model(classifier)
def Random_Forest_Classifier(features, target):
    print("===== RandomForest =====")
    print("[INFO] Training the Classifier")
    max_f = min(9, len(features[0]))
    # TODO(nkhadke, senwu): Figure out multiprocessing error
    classifier = RandomForestClassifier(n_estimators=1000,
                                        verbose=2,
                                        n_jobs=1,
                                        max_depth=10,
                                        min_samples_split=10,
                                        max_features=max_f,
                                        random_state=1,
                                        criterion='gini',
                                        compute_importances=True)  # boolean flag, not the string 'True'
    classifier.fit(features, target)
    print("Saving the classifier")
    data_io.save_model(classifier)
def main(): print("Reading in the training data") train = data_io.get_train_df() print("Extracting features and training model") classifier = get_pipeline(train) classifier.fit(train, train["SalaryNormalized"]) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading in the training data") train = data_io.get_train_df() print("Extracting features and training model") for key in train: classifier = get_pipeline(train[key]) classifier.fit(train[key], train[key]["SalaryNormalized"]) print("Saving the classifier for %s" %key) data_io.save_model(classifier,key)
def main(): print("Reading in the training data") train = data_io.get_train_df() print("Extracting features and training model") classifier = get_pipeline() classifier.fit(train[[x for x in train.columns if x != 'label']], train['label']) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading in the training data") train, train_labels = data_io.get_train() print("Extracting features and training model") classifier = RandomForestClassifier(n_estimators = 500, min_samples_leaf = 1) classifier.fit(train, train_labels) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading in the training data") train = data_io.read_train_pairs() target = data_io.read_train_target() print("Extracting features and training model") classifier = get_pipeline() classifier.fit(train, target.Target) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading in the training data") train = data_io.get_train_df() mean = train["SalaryNormalized"].mean() print("The mean salary is %f" % mean) print("Saving the model") data_io.save_model(mean) predictions = [mean] * len(train) print(metrics.MAE(predictions, train["SalaryNormalized"].tolist()))
def main():
    set1 = 'train' if len(sys.argv) < 2 else sys.argv[1]
    set2 = [] if len(sys.argv) < 3 else sys.argv[2:]
    train_filter = None
    train_filter2 = None
    model = MODEL(**MODEL_PARAMS)

    print("Reading in training data " + set1)
    train = data_io.read_data(set1)
    print("Extracting features")
    train = model.extract(train)
    print("Saving train features")
    data_io.write_data(set1, train)
    target = data_io.read_target(set1)

    train2 = None
    target2 = None
    for s in set2:
        print "Reading in training data", s
        tr = data_io.read_data(s)
        print "Extracting features"
        tr = model.extract(tr)
        print "Saving train features"
        data_io.write_data(s, tr)
        tg = data_io.read_target(s)
        train2 = tr if train2 is None else pd.concat((train2, tr), ignore_index=True)
        target2 = tg if target2 is None else pd.concat((target2, tg), ignore_index=True)
        train2, target2 = util.random_permutation(train2, target2)
        train_filter2 = ((train2['A type'] != 'Numerical') & (train2['B type'] == 'Numerical'))
        #train_filter2 |= ((train2['A type'] == 'Numerical') & (train2['B type'] != 'Numerical'))

    # Data selection
    train, target = util.random_permutation(train, target)
    train_filter = ((train['A type'] != 'Numerical') & (train['B type'] == 'Numerical'))
    #train_filter |= ((train['A type'] == 'Numerical') & (train['B type'] != 'Numerical'))
    if train_filter is not None:
        train = train[train_filter]
        target = target[train_filter]
    if train_filter2 is not None:
        train2 = train2[train_filter2]
        target2 = target2[train_filter2]

    print("Training model with optimal weights")
    X = pd.concat([train, train2]) if train2 is not None else train
    y = np.concatenate((target.Target.values, target2.Target.values)) if target2 is not None else target.Target.values
    model.fit(X, y)

    model_path = "cnmodel.pkl"
    print "Saving model", model_path
    data_io.save_model(model, model_path)
def main():
    #sample_size = int(sys.argv[1])
    ## sample_size = int(1000)

    # read train.csv
    train = pd.read_csv(data_io.data_path + "train_set.csv", index_col=False, header=None)
    train.columns = ['year', 'month', 'trade_no', 'sigungu_no', 'price', 'expense', 'count']
    print "Data Size:", (train.shape)

    # feature engineering
    #feature_eng(train)

    ## originally sample size = 100000
    train = train[:300000]

    # book_trainset = train_set[train_set['booking_bool']==1]
    # book_rows = book_trainset.index.tolist()
    # bsize = len(book_trainset.index)
    # click_trainset = train_set[train_set['click_bool']==1]
    # click_rows = click_trainset.index.tolist()
    # csize = len(click_trainset.index)
    # print 'bsize ' + str(bsize)
    # print 'csize ' + str(csize)
    # book_trainset = book_trainset.append(train_set.ix[random.sample(train_set.drop(book_rows).index, bsize)])
    # click_trainset = click_trainset.append(train_set.ix[random.sample(train_set.drop(click_rows).index, csize)])
    #book_trainset = train_set.ix[random.sample(train_set.drop(book_rows).index, bsize)]

    model_name = "predict_model"
    response_name = 'count'
    feature_names = ['year', 'month', 'trade_no', 'sigungu_no', 'price', 'expense']  #get_features()

    print "Training the " + model_name + " Classifier..."
    print "Using " + str(len(feature_names)) + " features..."
    tstart = datetime.now()
    features = train[feature_names].values
    target = train[response_name].values
    classifier = model.model()
    classifier.fit(features, target)
    print "Time used,", datetime.now() - tstart

    print "Saving the classifier..."
    tstart = datetime.now()
    data_io.save_model(classifier, model_name)
    print "Time used,", datetime.now() - tstart
def main(): features_targets = [entry for entry in csv.reader(open(settings["GBT_TRAIN_FILE_FOR_SUBMIT"]))] features = [map(float, entry[:-1]) for entry in features_targets] targets = [map(int, entry[-1]) for entry in features_targets] features_targets = [] classifier = RandomForestClassifier(n_estimators=200, verbose=2, n_jobs=4, min_samples_split=10, random_state=1) classifier.fit(features, targets) data_io.save_model(classifier)
def main():
    markdown = PagedownToHtml()

    print("Reading in the training data")
    train = data_io.get_train_df()
    for i in train.index:
        train["BodyMarkdown"][i] = markdown.convert(train["BodyMarkdown"][i])

    print("Extracting features and training")
    classifier = get_pipeline()
    classifier.fit(train, train["OpenStatus"])

    print("Saving the classifier")
    data_io.save_model(classifier, "model.pickle")
    model = data_io.load_model("model.pickle")
def main(): print("Reading in the training data") train = data_io.read_train() print("Reading in the meta data") paper_author, paper_author_indexed = f.get_paper_author() print("Computing Relational Information") computed_features = f.get_all_computed_features(paper_author) print("Extracting features") features = [] target = [] for author_id, row in train.iterrows(): for paper_id in row["DeletedPaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: target.append(1) features.append(s) for paper_id in row["ConfirmedPaperIds"]: s = f.get_features(paper_id, author_id, paper_author_indexed,computed_features) if s is None: print("Error at Author Id %d And Paper Id %d" % (author_id, paper_id)) else: target.append(0) features.append(s) print("Target Length: %d" % len(target)) print("Feature Length: %d" % len(features)) feature_matrix = pd.DataFrame(features) print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) try: classifier.fit(feature_matrix, target) except: import pdb;pdb.set_trace() print("Saving the classifier") data_io.save_model(classifier)
def main():
    sample_size = int(sys.argv[1])
    ## sample_size = int(1000)
    train = data_io.read_train()
    print("Data Size:")
    print(train.shape)
    feature_eng(train)

    ## originally sample size = 100000
    train_set = train[:sample_size]

    book_trainset = train_set[train_set['booking_bool'] == 1]
    book_rows = book_trainset.index.tolist()
    bsize = len(book_trainset.index)
    click_trainset = train_set[train_set['click_bool'] == 1]
    click_rows = click_trainset.index.tolist()
    csize = len(click_trainset.index)
    print 'bsize ' + str(bsize)
    print 'csize ' + str(csize)
    book_trainset = book_trainset.append(train_set.ix[random.sample(train_set.drop(book_rows).index, bsize)])
    click_trainset = click_trainset.append(train_set.ix[random.sample(train_set.drop(click_rows).index, csize)])

    ## Train the booking model
    for i in range(0, 2):
        if i == 0:
            model_name = "Booking"
            response_name = "booking_bool"
            train_sample = book_trainset
            isBook = True
        else:
            model_name = "Click"
            response_name = "click_bool"
            train_sample = click_trainset
            isBook = False

        print("Training the "+model_name+" Classifier...")
        tstart = datetime.now()
        feature_names = get_features(train_sample, isBook)
        print("Using "+str(len(feature_names))+" features...")
        features = train_sample[feature_names].values
        target = train_sample[response_name].values
        classifier = model.model()
        classifier.fit(features, target)

        # print the time interval
        print("Time used,")
        print datetime.now() - tstart

        print("Saving the classifier...")
        tstart = datetime.now()
        data_io.save_model(classifier, isBook)
        print("Time used,")
        print datetime.now() - tstart
def main(): print("Reading training data") train_chunks = data_io.read_train_features() train = pandas.concat([chunk for chunk in train_chunks], ignore_index=True) print("Training five Classifier") fiveClassifier = classify(train, "five") print("Training one Classifier") oneClassifier = classify(train, "one") print("Training zero Classifier") zeroClassifier = classify(train, "zero") classifier = (fiveClassifier, oneClassifier, zeroClassifier) print("Saving the classifiers") data_io.save_model(classifier)
def main():
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    conn = data_io.get_db_conn()
    feature_name = open("feature_list.txt").read().split()
    # if size < len(feature_name):
    #     to be done!

    for table_name in ["TrainDeleted", "TrainConfirmed"]:
        if rank > 0:
            # getting features by parallel computing
            print "getting features at node " + str(rank)
            feature = data_io_parallel.get_features_db_parallel(conn, rank, table_name, feature_name[rank - 1])
        else:
            feature = data_io_parallel.get_trained_validation_data(conn, table_name)

        # sending features to rank 0
        print "sending features to node " + str(rank)
        features = comm.gather(feature, root=0)
        #print features
        if rank == 0:
            temp = []
            for f in features:
                temp.extend(f)
            print "Successfully got the features from " + table_name
            if table_name == "TrainDeleted":
                features_deleted = map(list, np.array(temp).T)
            else:
                features_conf = map(list, np.array(temp).T)

    if rank == 0:
        features = [x[2:] for x in features_deleted + features_conf]
        target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))]

        print("Training the Classifier")
        classifier = RandomForestClassifier(n_estimators=50,
                                            verbose=2,
                                            n_jobs=1,
                                            min_samples_split=10,
                                            random_state=1)
        classifier.fit(features, target)

        print("Saving the classifier")
        data_io.save_model(classifier)
        print "Training completed, exit..."
        comm.Abort()
def runWithoutWndchrm(self):
    print "Reading in the training data"
    imageCollections = data_io.get_train_df()
    print "Getting features"
    featureGetter = FeatureGetter()
    fileName = data_io.get_savez_name()
    if not self.load:  #Last features calculated from candidates
        (namesObservations, coordinates, train) = Utils.calculateFeatures(fileName, featureGetter, imageCollections)
    else:
        (namesObservations, coordinates, train) = Utils.loadFeatures(fileName)
    print "Getting target vector"
    (indexes, target, obs) = featureGetter.getTargetVector(coordinates, namesObservations, train)
    print "Training the model"
    classifier = RandomForestClassifier(n_estimators=500, verbose=2, n_jobs=1,
                                        min_samples_split=10, random_state=1,
                                        compute_importances=True)
    #classifier = KNeighborsClassifier(n_neighbors=50)
    model = Pipeline([('scaling', MinMaxScaler()), ('classifying', classifier)])
    model.fit(obs[indexes], target[indexes])
    print "Saving the classifier"
    data_io.save_model(model)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    if len(sys.argv) != 3:
        print 'Command e.g.: python train.py -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        features_targets = [entry for entry in csv.reader(open(settings["LR_TRAIN_FILE"]))]
    elif para.target == 1:
        features_targets = [entry for entry in csv.reader(open(settings["LR_TRAIN_FILE_FOR_SUBMIT"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)

    features = [map(float, entry[2:-1]) for entry in features_targets]
    # each label is a single integer, not a one-element list
    targets = [int(entry[-1]) for entry in features_targets]

    '''classifier = GradientBoostingClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1)'''
    classifier = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0,
                                    fit_intercept=True, intercept_scaling=1,
                                    class_weight=None, random_state=None)
    classifier.fit(features, targets)
    data_io.save_model(classifier, settings["LR_MODEL_FILE"])
def main(): t1 = time() print("Reading in the training data") train = data_io.read_train_pairs() train_info = data_io.read_train_info() train = combine_types(train, train_info) #make function later train = get_types(train) target = data_io.read_train_target() print train print("Extracting features and training model") classifier = get_pipeline() classifier.fit(train, target.Target) features = [x[0] for x in classifier.steps[0][1].features ] csv_fea = csv.writer(open('features.csv','wb')) imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True) for fea in imp: print fea[0], fea[1] csv_fea.writerow([fea[0],fea[1]]) oob_score = classifier.steps[1][1].oob_score_ print "oob score:", oob_score logger = open("run_log.txt","a") if len(oob_score) == 1: logger.write("\n" +str( oob_score) + "\n") else:logger.write("\n" + str(oob_score[0]) + "\n") print("Saving the classifier") data_io.save_model(classifier) print("Predicting the train set") train_predict = classifier.predict(train) trian_predict = train_predict.flatten() data_io.write_submission(train_predict, 'train_set', run = 'train') t2 = time() t_diff = t2 - t1 print "Time Taken (min):", round(t_diff/60,1)
def main(): print("Getting features for deleted papers from the database") features_deleted = data_io.get_features_db("TrainDeleted") print("Getting features for confirmed papers from the database") features_conf = data_io.get_features_db("TrainConfirmed") features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted))] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier)
def main(): print("Reading in the raw data of features and salaries for merging") train_f = data_io.get_train_f_df() train_s = data_io.get_train_s_df() #train_f: training feature data; train_s: training salary data with 0 items deleted """ train_f.describe train_s.describe """ #merge the data by jobId, similar to SQL join data = pd.merge(train_f,train_s,how='left') data.to_csv("D:/job/indeed_data_science_exercise/RFC1/train9merge2.csv", sep=',',encoding='utf-8') # seperate the data into features set of the feature columns and the set with target column salary only #'companyId' excluded characters = ["jobType", "degree", "major", "industry", "yearsExperience", "milesFromMetropolis"] x = data[characters] y = data[['salary']] print("Extracting features and training model") classifier = get_pipeline() classifier.fit(xtr, ytr) print("Saving the classifier") data_io.save_model(classifier) print("Load testing data") testin = data_io.get_test_df() test=testin[characters] print("Making predictions") predictions = classifier.predict(test) predictions = predictions.reshape(len(predictions), 1) #classifier.get_params #pred_score=explained_variance_score(ycv, predictions, multioutput='raw_values') print("Writing predictions to file") write_submission(predictions)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', type=int, action='store', dest='target',
                        help='for validation or test dataset')
    if len(sys.argv) != 3:
        print 'Command e.g.: python train.py -t 0(1)'
        sys.exit(1)

    para = parser.parse_args()
    if para.target == 0:
        features_targets = [entry for entry in csv.reader(open(settings["MTLR_TRAIN_FILE"]))]
    elif para.target == 1:
        features_targets = [entry for entry in csv.reader(open(settings["MTLR_TRAIN_FILE_FOR_SUBMIT"]))]
    else:
        print 'Invalid train data target choice...'
        sys.exit(1)

    features = [map(float, entry[2:-1]) for entry in features_targets]
    pairs = [map(int, entry[:2]) for entry in features_targets]
    targets = [map(int, entry[-1]) for entry in features_targets]

    classifier = MeanRegularizedMultiTaskLR(C=0.1,
                                            tol=0.0001,
                                            intercept_scaling=1,
                                            lr=0.02,
                                            eta=0.1,
                                            field_for_model_num=2,
                                            max_niters=200,
                                            confidence=20,
                                            para_init="gaussian")
    classifier.fit(pairs, features, targets)
    data_io.save_model(classifier, settings["MTLR_MODEL_FILE"])
def training(processed_train_csv_file):
    processed_train_samples = pd.read_csv(processed_train_csv_file)
    processed_train_samples = processed_train_samples.replace([np.inf, -np.inf], np.nan)
    processed_train_samples = processed_train_samples.fillna(value=0)
    processed_train_samples_index_lst = processed_train_samples.index.tolist()
    random.shuffle(processed_train_samples_index_lst)
    shuffled_train_samples = processed_train_samples.ix[processed_train_samples_index_lst]

    col_names = shuffled_train_samples.columns.tolist()
    col_names.remove("booking_bool")
    features = shuffled_train_samples[col_names].values
    labels = shuffled_train_samples["booking_bool"].values

    print "Training Random Forest Classifier"
    # RandomForestClassifier takes no learning_rate argument
    rf_classifier = RandomForestClassifier(n_estimators=150, verbose=2,
                                           min_samples_split=10)
    rf_classifier.fit(features, labels)
    print "Saving the Random Forest Classifier"
    data_io.save_model(rf_classifier, model_name="rf_classifier.pkl")

    print "Training Gradient Boosting Classifier"
    gb_classifier = GradientBoostingClassifier(n_estimators=150, verbose=2,
                                               learning_rate=0.1, min_samples_split=10)
    gb_classifier.fit(features, labels)
    print "Saving the Gradient Boosting Classifier"
    data_io.save_model(gb_classifier, model_name="gb_classifier.pkl")

    print "Training SGD Classifier"
    # the scikit-learn loss name is "modified_huber"
    sgd_classifier = SGDClassifier(loss="modified_huber", verbose=2, n_jobs=-1)
    sgd_classifier.fit(features, labels)
    print "Saving the SGD Classifier"
    data_io.save_model(sgd_classifier, model_name="sgd_classifier.pkl")
def main(): print("Getting features for deleted papers from the database") if (os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if (os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, max_features=None) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier, prefix="forest_")
def main(): print("Getting features for each bird-class ") if(os.path.exists("features.obj")): with open("features.obj", 'r') as loadfile: features = cPickle.load(loadfile) else: features = data_io.get_features_csv() with open("features.obj", 'w') as dumpfile: cPickle.dump(features, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) target = [] print("Training the Classifier") classifier = RandomForestClassifier(n_estimators=100, verbose=2, n_jobs=1, min_samples_split=10, random_state=1, max_features=None) classifier.fit(features, target) print("Saving the classifier") data_io.save_model(classifier, prefix="forest_")
def run(self):
    features = f.features
    train = self.getTrainingDataset()

    print "Reading preprocessed features"
    if f.preprocessedFeatures != []:
        intermediate = data_io.read_intermediate_train()
        for i in f.preprocessedFeatures:
            train[i] = intermediate[i]
        for i in features:
            if i[0] in f.preprocessedFeatures:
                i[1] = i[0]
                i[2] = f.SimpleTransform(transformer=f.ff.identity)

    print "Reading targets"
    target = data_io.read_train_target()

    print "Extracting features and training model"
    classifier = self.getPipeline(features)
    if self.directionForward:
        finalTarget = [x * (x + 1) / 2 for x in target.Target]
    else:
        finalTarget = [-x * (x - 1) / 2 for x in target.Target]
    classifier.fit(train, finalTarget)
    print classifier.steps[-1][1].feature_importances_

    print "Saving the classifier"
    data_io.save_model(classifier)
                               verbose=2,
                               n_jobs=1,
                               oob_score=False,
                               min_samples_split=2,
                               random_state=3465343)
classifier = AdaBoostRegressor(base_estimator=base_clf,
                               n_estimators=n_trees,
                               random_state=3465343)
classifier.fit(features, salaries)
predictions = classifier.predict(validation_features)
print valid_salaries[1:10]
print np.exp(predictions[1:10])
mae = mean_absolute_error(valid_salaries, np.exp(predictions))
print "MAE validation: ", mae
save_model(classifier, name, mae)
#joblib.dump(predictions, path_join(prediction_dir, name + "_prediction_valid"))
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
base_clf = ExtraTreesRegressor(n_estimators=10,
                               verbose=2,
                               n_jobs=1,
                               oob_score=False,
                               min_samples_split=2,
                               random_state=3465343)
classifier = AdaBoostRegressor(base_estimator=base_clf,
                               n_estimators=n_trees,
                               random_state=3465343)
scores = cross_val_score(classifier, features, salaries, cv=3,
                         score_func=log_mean_absolute_error, verbose=1)
print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
def write_output(self, model, file):
    data_io.save_model(model, file)
def main(): ''' print("Getting features for deleted papers from the database") features_deleted = data_io.get_features_db("TrainDeleted") print("Getting features for confirmed papers from the database") features_conf = data_io.get_features_db("TrainConfirmed") ''' features_deleted = pickle.load( open(data_io.get_paths()["deleted_features"], 'rb')) features_conf = pickle.load( open(data_io.get_paths()["confirmed_features"], 'rb')) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] print("Training the Classifier") features = np.array(features) target = np.array(target) ''' classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1) classifier.fit(features, target) ''' #Referred https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/ for parameter tuning param_test1 = {'max_depth': [19], 'min_child_weight': [1]} param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]} param_test3 = { 'subsample': [i / 10.0 for i in range(6, 10)], 'colsample_bytree': [i / 10.0 for i in range(6, 10)] } ''' gsearch1 = GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=19, min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9, objective='binary:logistic', scale_pos_weight=1, seed=27), param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5) gsearch1.fit(features, target) print(gsearch1.grid_scores_) print(gsearch1.best_params_) print(gsearch1.best_score_) exit() ''' ''' classifier = xgb.XGBClassifier(learning_rate=0.03, n_estimators=300, max_depth=19, min_child_weight=1, gamma=0.1, subsample=0.9, colsample_bytree=0.9, objective='binary:logistic', seed=27).fit(features, target) ''' ''' classifier = RandomForestClassifier(n_estimators=50, verbose=2, n_jobs=1, min_samples_split=10, random_state=1).fit(features, target) ''' ''' print(len(features)) a = np.random.permutation(len(features))[0:10000] features = features[a] target = target[a] classifier = svm.SVC(probability=True).fit(features, target) ''' #classifier = GaussianNB().fit(features, target) classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit( features, target) print("Saving the classifier") data_io.save_model(classifier) # accuracy 0.9729 for valid set #classifier = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=0.05, objective="binary:logistic").fit(features, target) ''' accuracy 0.9723 for valid set
if __name__ == "__main__": print("Reading in the training data") train_raw = data_io.read_train_pairs() target = data_io.read_train_target() info = data_io.read_train_info() info['iindex'] = range(4050) train = train_raw.join(info) classifier = get_pipeline() ### FOLDS CODE # folds = cval.KFold(len(train), n_folds=2, indices=False) # # results = [] # for i, fold in enumerate(folds): # print("Extracting features and training model for fold " + str(i)) # traincv, testcv = fold # classifier.fit(train[traincv], target[traincv]) # results.append(classifier.score(train[testcv], target[testcv])) # # print(results) # print('Score: ' + str(np.array(results).mean())) ### REGULAR RUN classifier.fit(train, target.Target) print("Saving the classifier") data_io.save_model(classifier)
#print "Feature ranking:" #for f in xrange(len(indices)): #print "%d. feature %d %s (%f)" % (f + 1, indices[f], names[indices[f]], importances[indices[f]]) ## Plot the feature importances of the forest #import pylab as pl #pl.figure() #pl.title("Feature importances") #pl.bar(xrange(len(indices)), importances[indices], #color="r", align="center") #pl.xticks(xrange(len(indices)), indices) #pl.xlim([-1, len(indices)]) #pl.show() #a=5/0 save_model(classifier, name, mae) #joblib.dump(predictions, path_join(prediction_dir, name + "_prediction_valid")) #kmeans = KMeans( #random_state=3465343, #n_clusters=n_clusters, #n_jobs=-1, #verbose=0) #classifier = Pipeline(steps=[('knn', kmeans), ('tree', DecisionTreeRegressor())]) #classifier = DecisionTreeRegressor(max_depth=3, random_state=3465343) #classifier = ExtraTreesRegressor(n_estimators=n_trees, #verbose=1, #n_jobs=-1, #oob_score=False, #min_samples_split=min_samples_split, #random_state=3465343) clf = SGDRegressor(random_state=3465343, verbose=0, n_iter=n_trees)
def write_output(self, model, file):
    #save the output
    data_io.save_model(model, file)
def main(): t1 = time() print("Reading in the training data") train = data_io.read_train_pairs() train_info = data_io.read_train_info() train = combine_types(train, train_info) #make function later train = get_types(train) target = data_io.read_train_target() print "Reading SUP data..." for i in range(1, 4): print "SUP", str(i) sup = data_io.read_sup_pairs(i) sup_info = data_io.read_sup_info(i) sup = combine_types(sup, sup_info) sup = get_types(sup) sup_target = data_io.read_sup_target(i) train_info = train_info.append(sup_info) train = train.append(sup) target = target.append(sup_target) # Old train print "Reading old train data..." old_train = data_io.read_old_train_pairs() old_train_info = data_io.read_old_train_info() old_train = combine_types(old_train, old_train_info) old_train = get_types(old_train) old_target = data_io.read_old_train_target() train = train.append(old_train) target = target.append(old_target) # End old train print "Train size = ", str(train.shape) print("Extracting features and training model") feature_trans = fe.feature_extractor() orig_train = feature_trans.fit_transform(train) orig_train = numpy.nan_to_num(orig_train) classifier = classify_catagory(orig_train, target.Target) #(both_classifier, A_classifier, B_classifier, none_classifier) = create_classifiers(orig_train, target.Target, train_info) print("Saving features") data_io.save_features(orig_train) print("Saving the classifier") #data_io.save_model( (both_classifier, A_classifier, B_classifier, none_classifier) ) data_io.save_model(classifier) #features = [x[0] for x in classifier.steps[0][1].features ] #csv_fea = csv.writer(open('features.csv','wb')) #imp = sorted(zip(features, classifier.steps[1][1].feature_importances_), key=lambda tup: tup[1], reverse=True) #for fea in imp: # print fea[0], fea[1] # csv_fea.writerow([fea[0],fea[1]]) t2 = time() t_diff = t2 - t1 print "Time Taken (min):", round(t_diff / 60, 1)
def test_mlp(learning_rate=0.013, L1_reg=0.00, L2_reg=0.0003, n_epochs=300, n_hidden=100): """ :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer """ np.random.seed(0) print("Getting features for bird classes") if(os.path.exists("features.obj")): with open("features.obj", 'r') as loadfile: features, target = cPickle.load(loadfile) else: features, target = data_io.get_features_mat() with open("features.obj", 'w') as dumpfile: cPickle.dump((features, target), dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) datasets = load_data(features, target)#gen_data2() train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] batch_size = 16 # size of the minibatch # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### print '... building the model' # allocate symbolic variables for the data index = T.lscalar() x = T.matrix('x', dtype='float64') # sparse.csr_matrix('x', dtype='int32'); the data is presented as sparse matrix y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels rng = np.random.RandomState(113) # construct the MLP class classifier = MLP(rng=rng, input=x, n_in=features.shape[1], n_hidden=n_hidden, n_out=35) # the cost we minimize during training is the negative log likelihood of # the model plus the regularization terms (L1 and L2); cost is expressed # here symbolically cost = classifier.negative_log_likelihood(y) \ + L1_reg * classifier.L1 \ + L2_reg * classifier.L2_sqr # compiling a Theano function that computes the mistakes that are made # by the model on a minibatch # test_model = theano.function(inputs=[size], # outputs=[classifier.errors(y),classifier.getPredictions()], # givens={ # x: test_set_x[0:size], # y: test_set_y[0:size]} # ) # # validate_model = theano.function(inputs=[size], # outputs=[classifier.errors(y),classifier.getPredictions()], # givens={ # x:valid_set_x[0:size], # y:valid_set_y[0:size]} # ) test_model = theano.function(inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]}) validate_model = theano.function(inputs=[index], outputs=classifier.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size]}) # predict_model = theano.function(inputs=[], # outputs=classifier.predictions(), # givens={ # x: predict_set_x}) # compute the gradient of cost with respect to theta (sotred in params) # the resulting gradients will be stored in a list gparams gparams = [] for param in classifier.params: gparam = T.grad(cost, param) gparams.append(gparam) # specify how to update the parameters of the model as a dictionary updates = {} # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of # same length, zip generates a list C of same size, where each element # is a pair 
formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] for param, gparam in zip(classifier.params, gparams): updates[param] = param - learning_rate * gparam # compiling a Theano function `train_model` that returns the cost, but # in the same time updates the parameter of the model based on the rules # defined in `updates` # train_model = theano.function(inputs=[size], # outputs=cost, # updates=updates, # givens={ # x: train_set_x[0:size], # y: train_set_y[0:size]} # ) train_model = theano.function(inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size]}) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 1000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.0995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. start_time = time.clock() epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): #datasets = load_data(featuresMat, targetInts)#permute data() # # train_set_x, train_set_y = datasets[0] # valid_set_x, valid_set_y = datasets[1] # test_set_x, test_set_y = datasets[2] epoch = epoch + 1 training_cost = [] for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) training_cost.append(minibatch_avg_cost) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [validate_model(i) for i in xrange(n_valid_batches)] this_validation_loss = np.mean(validation_losses) # print('epoch %i, minibatch %i/%i, validation error %f %%' % # (epoch, minibatch_index + 1, n_train_batches, # this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter best_params = [] best_params.append(classifier.params) # test it on the test set test_losses = [test_model(i) for i in xrange(n_test_batches)] test_score = np.mean(test_losses) # print((' epoch %i, minibatch %i/%i, test error of ' # 'best model %f %%') % # (epoch, minibatch_index + 1, n_train_batches, # test_score * 100.)) # mean_cost = np.mean(training_cost) if(mean_cost < 0.0005): done_looping = True print "training cost: ", mean_cost break print "Epoch ", epoch," training cost: ", mean_cost end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) print("Saving the mlp best params") data_io.save_model(best_params, prefix="theano_")
def test_mlp(learning_rate=0.013, L1_reg=0.0001, L2_reg=0.0003, n_epochs=10000, n_hidden=50, n_hidden2=10): """ :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient :type L1_reg: float :param L1_reg: L1-norm's weight when added to the cost (see regularization) :type L2_reg: float :param L2_reg: L2-norm's weight when added to the cost (see regularization) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer """ np.random.seed(17) print("Getting features for deleted papers from the database") features_deleted = None features_conf = None if (os.path.exists("features_deleted.obj")): with open("features_deleted.obj", 'r') as loadfile: features_deleted = cPickle.load(loadfile) else: features_deleted = data_io.get_features_db("TrainDeleted") with open("features_deleted.obj", 'w') as dumpfile: cPickle.dump(features_deleted, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for confirmed papers from the database") if (os.path.exists("features_confirmed.obj")): with open("features_confirmed.obj", 'r') as loadfile: features_conf = cPickle.load(loadfile) else: features_conf = data_io.get_features_db("TrainConfirmed") with open("features_confirmed.obj", 'w') as dumpfile: cPickle.dump(features_conf, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) print("Getting features for valid papers from the database") if (os.path.exists("features_valid.obj")): with open("features_valid.obj", 'r') as loadfile: data = cPickle.load(loadfile) else: data = data_io.get_features_db("ValidPaper") with open("features_valid.obj", 'w') as dumpfile: cPickle.dump(data, dumpfile, protocol=cPickle.HIGHEST_PROTOCOL) author_paper_ids = [x[:2] for x in data] features_valid = [x[2:] for x in data] features_validnp = np.array(features_valid, dtype='float64') # predictInts = [] # for tup in features_valid: # a, b, c, d, e = tup # predictInts.append((int(a), int(b), int(c), int(d), int(e))) # # predictsMat = np.ndarray(shape=(len(predictInts), 5), dtype='int32') # for i, tup in enumerate(predictInts): # a, b, c, d, e = tup # predictsMat[i, 0] = a; predictsMat[i, 1] = b; predictsMat[i, 2] = c; predictsMat[i, 3] = d; predictsMat[i, 4] = e; predict_set_x = theano.shared(features_validnp, borrow=True) features = [x[2:] for x in features_deleted + features_conf] target = [0 for x in range(len(features_deleted)) ] + [1 for x in range(len(features_conf))] featuresnp = np.array(features, dtype='float64') targetnp = np.array(target, dtype='int32') featuresnp -= np.mean(featuresnp, axis=0) featuresnp /= np.std(featuresnp, axis=0) cv = cross_validation.ShuffleSplit(len(features), n_iter=1, test_size=0.25, random_state=0) for train, test in cv: train_set_x = theano.shared(featuresnp[train], borrow=True) test_set_x = theano.shared(featuresnp[test], borrow=True) train_set_y = theano.shared(targetnp[train], borrow=True) test_set_y = theano.shared(targetnp[test], borrow=True) batch_size = 20 # size of the minibatch # compute number of minibatches for training, validation and testing n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size # n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size ###################### # BUILD ACTUAL MODEL # ###################### print '... 
building the model' # allocate symbolic variables for the data # size = T.lscalar() index = T.lscalar() x = T.matrix( 'x', dtype='float64' ) # sparse.csr_matrix('x', dtype='int32'); the data is presented as sparse matrix y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels rng = np.random.RandomState(113) # construct the MLP class classifier = MLP(rng=rng, input=x, n_in=featuresnp.shape[1], n_hidden=n_hidden, n_out=2, n_hidden2=10) cost = classifier.negative_log_likelihood(y) \ + L1_reg * classifier.L1 \ + L2_reg * classifier.L2_sqr test_model = theano.function( inputs=[index], outputs=classifier.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) predict_model = theano.function(inputs=[], outputs=classifier.predictions(), givens={x: predict_set_x}) # compute the gradient of cost with respect to theta (sotred in params) # the resulting gradients will be stored in a list gparams gparams = [] for param in classifier.params: gparam = T.grad(cost, param) gparams.append(gparam) # specify how to update the parameters of the model as a dictionary updates = OrderedDict() # given two list the zip A = [a1, a2, a3, a4] and B = [b1, b2, b3, b4] of # same length, zip generates a list C of same size, where each element # is a pair formed from the two lists : # C = [(a1, b1), (a2, b2), (a3, b3), (a4, b4)] for param, gparam in zip(classifier.params, gparams): updates[param] = param - learning_rate * gparam train_model = theano.function( inputs=[index], outputs=cost, updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] }) ############### # TRAIN MODEL # ############### print '... training' # early-stopping parameters patience = 1000000 # look as this many examples regardless patience_increase = 2 # wait this much longer when a new best is # found improvement_threshold = 0.0995 # a relative improvement of this much is # considered significant validation_frequency = min(n_train_batches, patience / 2) # go through this many # minibatche before checking the network # on the validation set; in this case we # check every epoch best_params = None best_validation_loss = np.inf best_iter = 0 test_score = 0. 
start_time = time.clock() epoch = 0 done_looping = False best_params = None while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 training_cost = [] for minibatch_index in xrange(n_train_batches): minibatch_avg_cost = train_model(minibatch_index) training_cost.append(minibatch_avg_cost) # iteration number iter = (epoch - 1) * n_train_batches + minibatch_index if (iter + 1) % validation_frequency == 0: # compute zero-one loss on validation set validation_losses = [ test_model(i) for i in xrange(n_test_batches) ] this_validation_loss = np.mean(validation_losses) print('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.)) # if we got the best validation score until now if this_validation_loss < best_validation_loss: #improve patience if loss improvement is good enough if this_validation_loss < best_validation_loss * \ improvement_threshold: patience = max(patience, iter * patience_increase) best_validation_loss = this_validation_loss best_iter = iter best_params = classifier.params if (best_validation_loss < 0.005): done_looping = True print "Best Validation cost: ", best_validation_loss break mean_cost = np.mean(training_cost) print "Epoch ", epoch, " training cost: ", mean_cost end_time = time.clock() print(('Optimization complete. Best validation score of %f %% ' 'obtained at iteration %i, with test performance %f %%') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm' % ((end_time - start_time) / 60.)) print("Saving the mlp best params") data_io.save_model(best_params, prefix="theano_") ############################ #Making Predictions ############################ print("Making predictions") predictions = predict_model( ) #classifier.predict_proba(features_valid)[:,1] predictions = list(predictions) author_predictions = defaultdict(list) paper_predictions = {} for (a_id, p_id), pred in zip(author_paper_ids, predictions): author_predictions[a_id].append((pred, p_id)) for author_id in sorted(author_predictions): paper_ids_sorted = sorted(author_predictions[author_id], reverse=True) paper_predictions[author_id] = [x[1] for x in paper_ids_sorted] print("Writing predictions to file") data_io.write_submission(paper_predictions, prefix="theano_")