def main():
    data = pd.read_csv('data/svm-data.csv', header=None)
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values
    clf = SVC(C=100000, random_state=241)
    clf.fit(X, y)
    result = str([x + 1 for x in clf.support_])
    write_submission(result, '61')
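# The two-argument write_submission used in this snippet and in several of the
# assignment snippets below is a project helper that is not shown in these
# excerpts. A minimal sketch of what it presumably does, assuming it simply
# writes one answer string per task into a '<task_id>.txt' file (the file
# naming is an assumption for illustration, not taken from the source):
def write_submission(answer, task_id):
    # Hypothetical helper: persist a single answer string for the given task.
    with open('{0}.txt'.format(task_id), 'w') as f:
        f.write(str(answer))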
def main():
    model = utils.load_model()
    valid_df = utils.get_valid_df()
    predictions = model.predict(valid_df)
    predictions = predictions.reshape(len(predictions), 1)
    utils.write_submission(predictions)
def main():
    boston = load_boston()
    X = scale(boston.data)
    y = boston.target
    classifier = KNeighborsRegressor(n_neighbors=5, weights='distance')
    kf = KFold(len(X), n_folds=5, shuffle=True, random_state=42)
    neighbors_range = np.linspace(1.0, 10.0, num=200)
    write_submission(
        int(classifier_choice_cv(X, y, classifier, 'p', neighbors_range, kf,
                                 scoring='mean_squared_error')[0]),
        '41')
def main():
    wine_file = 'data/wine.data'
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    try:
        data = pd.read_csv(wine_file, header=None)
    except IOError:
        print('No {0} file found, downloading it from {1}'.format(wine_file, url))
        wine_request = requests.get(url).content
        data = pd.read_csv(io.StringIO(wine_request.decode('utf-8')), header=None)
    X = data.iloc[:, 1:].values
    X_scaled = scale(X)
    y = data.iloc[:, 0].values
    kf = KFold(len(data.index), n_folds=5, shuffle=True, random_state=42)
    neighbors_range = list(range(1, 51))
    # Run the cross-validated scan once per feature matrix and reuse the
    # result instead of recomputing it for every submission.
    best_raw = classifier_choice_cv(X, y, KNeighborsClassifier,
                                    'n_neighbors', neighbors_range, kf)
    best_scaled = classifier_choice_cv(X_scaled, y, KNeighborsClassifier,
                                       'n_neighbors', neighbors_range, kf)
    write_submission(best_raw[0], '31')
    write_submission(best_raw[1], '32')
    write_submission(best_scaled[0], '33')
    write_submission(best_scaled[1], '34')
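# classifier_choice_cv (used here and in the Boston-housing snippet above) is
# another project helper that is not included in these excerpts. Judging from
# the calls, it scans a hyper-parameter range with cross-validation and returns
# the best parameter value at index 0 and the best mean CV score at index 1.
# The sketch below is a guess at that behavior, written against the pre-0.18
# scikit-learn cross_validation API that the surrounding code uses; it accepts
# either an estimator class or an estimator instance, since both appear above.
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score


def classifier_choice_cv(X, y, estimator, param_name, param_range, cv,
                         scoring='accuracy'):
    best_value, best_score = None, None
    for value in param_range:
        if isinstance(estimator, type):
            est = estimator(**{param_name: value})
        else:
            est = clone(estimator).set_params(**{param_name: value})
        score = cross_val_score(est, X, y, cv=cv, scoring=scoring).mean()
        if best_score is None or score > best_score:
            best_value, best_score = value, score
    return best_value, best_score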
def generate_predict(self):
    # Train model and create a submission
    if not self.hasTrained:
        train_x, y = self.load_train()
        self.fit(train_x, y)
        self.hasTrained = True
    print("Creating submission")
    test_features = self.load_test()
    preds = self.compute_predict(test_features)
    preds = self.transform(preds)
    u.write_submission(preds.astype(int),
                       self.subm + self.name + "_submission.csv")
    print("Submission successfully created")
def trees_assignment():
    data = pd.read_csv("data/titanic.csv", index_col="PassengerId")
    data21 = data.dropna(subset=["Pclass", "Fare", "Age", "Survived", "Sex"])
    pd.options.mode.chained_assignment = None  # suppress false-positive warning
    data21["Sex"] = data21["Sex"].map({"female": 0, "male": 1})
    pd.options.mode.chained_assignment = "warn"  # restore the warning
    feature_names = ["Pclass", "Fare", "Age", "Sex"]
    X = data21[feature_names]
    y = data21["Survived"]
    clf = DecisionTreeClassifier(random_state=241)
    clf.fit(X, y)
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    subm21 = [feature_names[f] for f in indices][:2]
    write_submission(subm21, "21")
def main():
    data = {}
    scaler = StandardScaler()
    for data_type in ['train', 'test']:
        df = pd.read_csv('data/perceptron-{0}.csv'.format(data_type), header=None)
        data['X_' + data_type] = df.iloc[:, 1:].values
        data['y_' + data_type] = df.iloc[:, 0].values
    data['X_train_scaled'] = scaler.fit_transform(data['X_train'])
    data['X_test_scaled'] = scaler.transform(data['X_test'])
    acc = get_accuracy(data['X_train'], data['y_train'],
                       data['X_test'], data['y_test'])
    acc_scaled = get_accuracy(data['X_train_scaled'], data['y_train'],
                              data['X_test_scaled'], data['y_test'])
    write_submission(round(abs(acc - acc_scaled), 3), '51')
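# get_accuracy is not defined in these excerpts. For the scaling comparison
# above, a plausible sketch is to fit a linear perceptron on the training split
# and report accuracy on the test split; the choice of Perceptron(random_state=241)
# is an assumption for illustration, not taken from the source.
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score


def get_accuracy(X_train, y_train, X_test, y_test):
    clf = Perceptron(random_state=241)
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))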
def main():
    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('There is no saved model; please check the ckpt path.')
        exit()
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        # print('Local predicting...')
        # print('valid batches:%d' % n_va_batches)
        # local_predict(sess, model)
        print('Test predicting...')
        print('test batches:%d' % n_tr_batches)
        results = predict(sess, model)
        sub_path_name = sub_path + model_name + str(strftime("%m%d%H%M")) + '.csv'
        id_list = get_id_list(id_list_path)
        # id_list = id_list[:len(results)]
        write_submission(sub_path_name, id_list, results, sr_id2title)
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    newsgroups = fetch_20newsgroups(subset='all',
                                    categories=['alt.atheism', 'sci.space'])
    # Keep the names straight: the documents are the features and the
    # newsgroup labels are the target.
    texts = newsgroups.data
    y = newsgroups.target
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(texts)
    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    cv = KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=241)
    clf = SVC(kernel='linear', random_state=241)
    gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
    gs.fit(X_train, y)
    clf.set_params(**gs.best_params_)
    clf.fit(X_train, y)
    result = show_top10(clf, vectorizer)
    result.sort()
    write_submission(str([x for x in result]).lower().encode('ascii', 'ignore'),
                     '71')  # still needs some work to get rid of the unicode problem
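# show_top10 is another undefined helper. Judging by the task (report the ten
# words with the largest absolute SVM weights in the TF-IDF vocabulary), a
# plausible sketch is below; the exact return value of the real helper is an
# assumption. get_feature_names is the older scikit-learn spelling
# (get_feature_names_out in newer releases).
import numpy as np


def show_top10(clf, vectorizer):
    feature_names = np.asarray(vectorizer.get_feature_names())
    coefs = clf.coef_
    coefs = coefs.toarray() if hasattr(coefs, 'toarray') else coefs
    top10 = np.argsort(np.abs(coefs[0]))[-10:]
    return list(feature_names[top10])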
def main():
    data = np.genfromtxt("data/data-logistic.csv", delimiter=",")
    y = data[:, 0]
    X = data[:, 1:]
    e_convergence = 0.00001
    max_iteration = 10000
    c_reg = 10
    k_step = 0.1

    # Gradient ascent without regularization, starting from zero weights.
    weights_start = [[0.0, 0.0]]
    euclidean_distance = float('inf')
    iteration_count = 0
    while math.sqrt(euclidean_distance) > e_convergence and iteration_count < max_iteration:
        weights_gradient = update_weights(X, y, weights_start, k_step)
        euclidean_distance = 0
        for w in xrange(len(weights_gradient[0])):
            euclidean_distance += (weights_gradient[-1][w] - weights_gradient[-2][w]) ** 2
        iteration_count += 1
    final_w = weights_gradient[-1]

    # L2-regularized run; restart from zero weights so both runs share the
    # same starting point instead of continuing from the first trajectory.
    weights_start_reg = [[0.0, 0.0]]
    euclidean_distance = float('inf')
    iteration_count = 0
    while math.sqrt(euclidean_distance) > e_convergence and iteration_count < max_iteration:
        weights_gradient_reg = update_weights_reg(X, y, weights_start_reg, c_reg, k_step)
        euclidean_distance = 0
        for w in xrange(len(weights_gradient_reg[0])):
            euclidean_distance += (weights_gradient_reg[-1][w] - weights_gradient_reg[-2][w]) ** 2
        iteration_count += 1
    final_w_reg = weights_gradient_reg[-1]

    y_scores = [sigmoid(X[i].tolist(), final_w) for i in xrange(len(X))]
    y_scores_reg = [sigmoid(X[i].tolist(), final_w_reg) for i in xrange(len(X))]
    write_submission([round(roc_auc_score(y, y_scores), 3),
                      round(roc_auc_score(y, y_scores_reg), 3)], "81")
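# sigmoid, update_weights and update_weights_reg are project helpers that are
# not shown. Given how the convergence loops above use them (each call appends
# one more weight vector to the trajectory and returns the whole list), a
# plausible sketch of the gradient-ascent step, with and without an L2 penalty,
# is below. The exact update formula is an assumption based on the standard
# two-feature logistic-regression gradient step.
import math


def sigmoid(x, w):
    # P(y = +1 | x) for a two-feature logistic model.
    return 1.0 / (1.0 + math.exp(-w[0] * x[0] - w[1] * x[1]))


def update_weights_reg(X, y, weights, c_reg, k_step):
    # Append one gradient-ascent step with L2 regularization strength c_reg.
    w = weights[-1]
    l = len(y)
    new_w = []
    for j in range(len(w)):
        grad = sum(
            y[i] * X[i][j] *
            (1.0 - 1.0 / (1.0 + math.exp(-y[i] * (w[0] * X[i][0] + w[1] * X[i][1]))))
            for i in range(l)) / l
        new_w.append(w[j] + k_step * grad - k_step * c_reg * w[j])
    weights.append(new_w)
    return weights


def update_weights(X, y, weights, k_step):
    # Unregularized step: the same update with the penalty term switched off.
    return update_weights_reg(X, y, weights, 0.0, k_step)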
X, y, X_test, y_label_encoder = utils.training_data()
# X, X_test = utils.extract_feature(X, X_test, "bypass", False)
X_train, X_cv, y_train, y_cv, cv_indices = utils.split_data(X, y)

y_train = to_categorical(y_train)
y_cv = to_categorical(y_cv)
y_full = to_categorical(y_cv)

lb = LabelBinarizer()
lb.fit(y_train)
y_train = lb.transform(y_train)
y_test = lb.transform(y_cv)
num_classes = y_train.shape[1]

# Reorder the axes and add a trailing channel dimension for the network input.
X_train = np.transpose(X_train, (0, 2, 1))
X_cv = np.transpose(X_cv, (0, 2, 1))
X_test = np.transpose(X_test, (0, 2, 1))
X_full = np.transpose(X, (0, 2, 1))
X_train = X_train[..., np.newaxis]
X_cv = X_cv[..., np.newaxis]
X_test = X_test[..., np.newaxis]
X_full = X_full[..., np.newaxis]

model = load_model("trained_model.h5")
y_pred_test = model.predict_classes(X_test)
y_pred_labels = list(y_label_encoder.inverse_transform(y_pred_test))
utils.write_submission("neural_prediction", y_pred_labels)
def main():
    input_dir = "/amit/kaggle/tgs"
    output_dir = "/artifacts"
    image_size_target = 128
    batch_size = 32
    epochs_to_train = 300
    bce_loss_weight_gamma = 0.98
    sgdr_min_lr = 0.0001  # 0.0001, 0.001
    sgdr_max_lr = 0.001  # 0.001, 0.03
    sgdr_cycle_epochs = 20
    sgdr_cycle_epoch_prolongation = 3
    sgdr_cycle_end_patience = 3
    train_abort_epochs_without_improval = 30
    ensemble_model_count = 3
    swa_epoch_to_start = 30

    model_dir = sys.argv[1] if len(sys.argv) > 1 else None

    train_data = TrainData(input_dir)

    train_set = TrainDataset(train_data.train_set_df, image_size_target, augment=True)
    train_set_data_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=8)

    val_set = TrainDataset(train_data.val_set_df, image_size_target, augment=False)
    val_set_data_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=2)

    if model_dir:
        model = create_model(pretrained=False).to(device)
        model.load_state_dict(torch.load("{}/model.pth".format(model_dir), map_location=device))
    else:
        model = create_model(pretrained=True).to(device)

    torch.save(model.state_dict(), "{}/model.pth".format(output_dir))

    swa_model = create_model(pretrained=False).to(device)

    print("train_set_samples: %d, val_set_samples: %d" % (len(train_set), len(val_set)))

    global_val_precision_best_avg = float("-inf")
    global_swa_val_precision_best_avg = float("-inf")
    sgdr_cycle_val_precision_best_avg = float("-inf")

    epoch_iterations = len(train_set) // batch_size

    # optimizer = optim.SGD(model.parameters(), lr=sgdr_max_lr, weight_decay=0, momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=sgdr_max_lr)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=sgdr_cycle_epochs, eta_min=sgdr_min_lr)

    optim_summary_writer = SummaryWriter(log_dir="{}/logs/optim".format(output_dir))
    train_summary_writer = SummaryWriter(log_dir="{}/logs/train".format(output_dir))
    val_summary_writer = SummaryWriter(log_dir="{}/logs/val".format(output_dir))
    swa_val_summary_writer = SummaryWriter(log_dir="{}/logs/swa_val".format(output_dir))

    sgdr_iterations = 0
    sgdr_reset_count = 0
    batch_count = 0
    epoch_of_last_improval = 0

    sgdr_next_cycle_end_epoch = sgdr_cycle_epochs + sgdr_cycle_epoch_prolongation
    swa_update_count = 0

    ensemble_model_index = 0
    for model_file_path in glob.glob("{}/model-*.pth".format(output_dir)):
        model_file_name = os.path.basename(model_file_path)
        model_index = int(model_file_name.replace("model-", "").replace(".pth", ""))
        ensemble_model_index = max(ensemble_model_index, model_index + 1)

    print('{"chart": "best_val_precision", "axis": "epoch"}')
    print('{"chart": "val_precision", "axis": "epoch"}')
    print('{"chart": "val_loss", "axis": "epoch"}')
    print('{"chart": "sgdr_reset", "axis": "epoch"}')
    print('{"chart": "precision", "axis": "epoch"}')
    print('{"chart": "loss", "axis": "epoch"}')
    print('{"chart": "swa_val_precision", "axis": "epoch"}')
    print('{"chart": "swa_val_loss", "axis": "epoch"}')

    train_start_time = time.time()

    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs_to_train):
        epoch_start_time = time.time()
        model.train()

        train_loss_sum = 0.0
        train_precision_sum = 0.0
        train_step_count = 0

        for batch in train_set_data_loader:
            images, masks, mask_weights = \
                batch[0].to(device, non_blocking=True), \
                batch[1].to(device, non_blocking=True), \
                batch[2].to(device, non_blocking=True)

            lr_scheduler.step(epoch=min(sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))

            optimizer.zero_grad()
            prediction_logits = model(images)
            predictions = torch.sigmoid(prediction_logits)
            criterion.weight = mask_weights
            loss = criterion(prediction_logits, masks)
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_precision_sum += np.mean(precision_batch(predictions, masks))
            sgdr_iterations += 1
            train_step_count += 1
            batch_count += 1

            optim_summary_writer.add_scalar("lr", get_learning_rate(optimizer), batch_count + 1)

        train_loss_avg = train_loss_sum / train_step_count
        train_precision_avg = train_precision_sum / train_step_count

        val_loss_avg, val_precision_avg = evaluate(model, val_set_data_loader, criterion)

        model_improved_within_sgdr_cycle = val_precision_avg > sgdr_cycle_val_precision_best_avg
        if model_improved_within_sgdr_cycle:
            torch.save(model.state_dict(), "{}/model-{}.pth".format(output_dir, ensemble_model_index))
            sgdr_cycle_val_precision_best_avg = val_precision_avg

        model_improved = val_precision_avg > global_val_precision_best_avg
        ckpt_saved = False
        if model_improved:
            torch.save(model.state_dict(), "{}/model.pth".format(output_dir))
            global_val_precision_best_avg = val_precision_avg
            ckpt_saved = True

        swa_model_improved = False
        if epoch + 1 >= swa_epoch_to_start:
            if model_improved_within_sgdr_cycle:
                swa_update_count += 1
                moving_average(swa_model, model, 1.0 / swa_update_count)
                bn_update(train_set_data_loader, swa_model)
            swa_model_improved = val_precision_avg > global_swa_val_precision_best_avg
            if swa_model_improved:
                torch.save(swa_model.state_dict(), "{}/swa_model.pth".format(output_dir))
                global_swa_val_precision_best_avg = val_precision_avg

        if model_improved or swa_model_improved:
            epoch_of_last_improval = epoch

        sgdr_reset = False
        if (epoch + 1 >= sgdr_next_cycle_end_epoch) and (epoch - epoch_of_last_improval >= sgdr_cycle_end_patience):
            sgdr_iterations = 0
            sgdr_next_cycle_end_epoch = epoch + 1 + sgdr_cycle_epochs + sgdr_cycle_epoch_prolongation
            ensemble_model_index += 1
            sgdr_cycle_val_precision_best_avg = float("-inf")
            sgdr_reset_count += 1
            sgdr_reset = True

        swa_val_loss_avg, swa_val_precision_avg = evaluate(swa_model, val_set_data_loader, criterion)

        optim_summary_writer.add_scalar("sgdr_reset", sgdr_reset_count, epoch + 1)

        train_summary_writer.add_scalar("loss", train_loss_avg, epoch + 1)
        train_summary_writer.add_scalar("precision", train_precision_avg, epoch + 1)

        val_summary_writer.add_scalar("loss", val_loss_avg, epoch + 1)
        val_summary_writer.add_scalar("precision", val_precision_avg, epoch + 1)

        swa_val_summary_writer.add_scalar("loss", swa_val_loss_avg, epoch + 1)
        swa_val_summary_writer.add_scalar("precision", swa_val_precision_avg, epoch + 1)

        epoch_end_time = time.time()
        epoch_duration_time = epoch_end_time - epoch_start_time

        print(
            "[%03d/%03d] %ds, lr: %.6f, loss: %.3f, val_loss: %.3f|%.3f, prec: %.3f, val_prec: %.3f|%.3f, ckpt: %d, rst: %d" % (
                epoch + 1,
                epochs_to_train,
                epoch_duration_time,
                get_learning_rate(optimizer),
                train_loss_avg,
                val_loss_avg,
                swa_val_loss_avg,
                train_precision_avg,
                val_precision_avg,
                swa_val_precision_avg,
                int(ckpt_saved),
                int(sgdr_reset)),
            flush=True)

        print('{"chart": "best_val_precision", "x": %d, "y": %.3f}' % (epoch + 1, global_val_precision_best_avg))
        print('{"chart": "val_precision", "x": %d, "y": %.3f}' % (epoch + 1, val_precision_avg))
        print('{"chart": "val_loss", "x": %d, "y": %.3f}' % (epoch + 1, val_loss_avg))
        print('{"chart": "sgdr_reset", "x": %d, "y": %.3f}' % (epoch + 1, sgdr_reset_count))
        print('{"chart": "precision", "x": %d, "y": %.3f}' % (epoch + 1, train_precision_avg))
        print('{"chart": "loss", "x": %d, "y": %.3f}' % (epoch + 1, train_loss_avg))
        print('{"chart": "swa_val_precision", "x": %d, "y": %.3f}' % (epoch + 1, swa_val_precision_avg))
        print('{"chart": "swa_val_loss", "x": %d, "y": %.3f}' % (epoch + 1, swa_val_loss_avg))

        if sgdr_reset and sgdr_reset_count >= ensemble_model_count and epoch - epoch_of_last_improval >= train_abort_epochs_without_improval:
            print("early abort")
            break

    optim_summary_writer.close()
    train_summary_writer.close()
    val_summary_writer.close()

    train_end_time = time.time()
    print()
    print("Train time: %s" % str(datetime.timedelta(seconds=train_end_time - train_start_time)))

    eval_start_time = time.time()

    print()
    print("evaluation of the training model")

    model.load_state_dict(torch.load("{}/model.pth".format(output_dir), map_location=device))
    analyze(Ensemble([model]), train_data.val_set_df, use_tta=False)
    analyze(Ensemble([model]), train_data.val_set_df, use_tta=True)

    score_to_model = {}
    ensemble_model_candidates = glob.glob("{}/model-*.pth".format(output_dir))
    ensemble_model_candidates.append("{}/swa_model.pth".format(output_dir))
    for model_file_path in ensemble_model_candidates:
        model_file_name = os.path.basename(model_file_path)

        m = create_model(pretrained=False).to(device)
        m.load_state_dict(torch.load(model_file_path, map_location=device))

        val_loss_avg, val_precision_avg = evaluate(m, val_set_data_loader, criterion)
        print("ensemble '%s': val_loss=%.3f, val_precision=%.3f" % (model_file_name, val_loss_avg, val_precision_avg))

        # Keep only the ensemble_model_count best-scoring candidates.
        if len(score_to_model) < ensemble_model_count:
            score_to_model[val_precision_avg] = m
        elif min(score_to_model.keys()) < val_precision_avg:
            del score_to_model[min(score_to_model.keys())]
            score_to_model[val_precision_avg] = m

    ensemble_models = list(score_to_model.values())
    for ensemble_model in ensemble_models:
        val_loss_avg, val_precision_avg = evaluate(ensemble_model, val_set_data_loader, criterion)
        print("ensemble: val_loss=%.3f, val_precision=%.3f" % (val_loss_avg, val_precision_avg))

    model = Ensemble(ensemble_models)

    mask_threshold_global, mask_threshold_per_cc = analyze(model, train_data.val_set_df, use_tta=True)

    eval_end_time = time.time()
    print()
    print("Eval time: %s" % str(datetime.timedelta(seconds=eval_end_time - eval_start_time)))

    print()
    print("submission preparation")

    submission_start_time = time.time()

    test_data = TestData(input_dir)
    calculate_predictions(test_data.df, model, use_tta=True)
    calculate_prediction_masks(test_data.df, mask_threshold_global)

    print()
    print(test_data.df.groupby("predictions_cc").agg({"predictions_cc": "count"}))

    write_submission(test_data.df, "prediction_masks", "{}/{}".format(output_dir, "submission.csv"))
    write_submission(test_data.df, "prediction_masks_best", "{}/{}".format(output_dir, "submission_best.csv"))

    submission_end_time = time.time()
    print()
    print("Submission time: %s" % str(datetime.timedelta(seconds=submission_end_time - submission_start_time)))
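# moving_average, bn_update and get_learning_rate are project helpers that are
# not included in this excerpt. moving_average appears to follow the usual
# stochastic weight averaging (SWA) parameter update; a minimal sketch of it
# and of get_learning_rate, under that assumption, is below (bn_update would
# additionally need a pass over the training loader to refresh BatchNorm
# statistics and is omitted here).
def moving_average(swa_model, model, alpha):
    # Running average of parameters: swa <- (1 - alpha) * swa + alpha * model.
    for swa_param, param in zip(swa_model.parameters(), model.parameters()):
        swa_param.data *= 1.0 - alpha
        swa_param.data += param.data * alpha


def get_learning_rate(optimizer):
    # Assumes a single parameter group, as configured above.
    return optimizer.param_groups[0]["lr"]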
with open('result_statistics_cross_val_marleen.txt', mode='a+') as f:
    f.write('%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.5f\t%s\t%.0f\n' %
            (k, rand_seed, len(feats), n_estimators, all_n,
             len(df_to_train) / 100000, len(df_to_test) / 100000,
             learning_rate, downsampling_rate,
             metric.calc_mean(teid_arr, terelevance_arr, tepred),
             balance_flag, min_samples_leaf))

results_df = df_to_test[['srch_id', 'prop_id', 'relevance']].copy()
results_df['score'] = -1 * tepred
# predictions = list(-1.0*predictions)
recommendations = zip(results_df["srch_id"], results_df["prop_id"],
                      results_df['relevance'], results_df['score'])
utils.write_submission(recommendations, "lambdamart_test.csv")

path_results = "lambdamart_test.csv"
nDCG_result = nDCG.compute_ndcg(path_results)
print(nDCG_result)

with open('result_statistics_our_nDCG_cross_val_marleen.txt', mode='a+') as f:
    f.write('%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.5f\t%s\t%.0f\n' %
            (k, rand_seed, len(feats), n_estimators, all_n,
             len(df_to_train) / 100000, len(df_to_test) / 100000,
             learning_rate, downsampling_rate, nDCG_result,
             balance_flag, min_samples_leaf))

import pickle
model_save = pickle.dumps(model)
new_model = pickle.loads(model_save)
print "cross validation results:" print roc_aucs ############################################################################ # fit and predict ############################################################################ result = fb_funcs.fit_and_predict(info_humans, info_bots, info_test, params=params_ens, scale='log') y_test_proba = result['y_test_proba'] ytps = result['ytps'] # feature_importances = pd.DataFrame(np.array([result['features'], # result['importances']]).T) ############################################################################ # submission file generation ############################################################################ submissionfile = 'data/submi/sub_ens.csv' testfile = 'data/test.csv' print "writing a submission file..." write_submission(y_test_proba, info_test.index, testfile, submissionfile) print "writing results from different models..." for i in range(len(ytps)): submf = 'data/submi/sub_%s.csv' %(params_ens[i]['model']) write_submission(ytps[i], info_test.index, testfile, submf)
def generate_predict(self):
    # Train model and create a submission.
    # If a checkpoint is available, do not train anew.
    if self.checkpoint_dir:
        text_path = os.path.join(os.path.curdir, self.checkpoint_dir, "..",
                                 "text_vocab")
        self.text_vocab_processor = \
            tf.contrib.learn.preprocessing.VocabularyProcessor.restore(text_path)
        checkpoint_file = tf.train.latest_checkpoint(self.checkpoint_dir)
        graph = tf.Graph()
        with graph.as_default():
            session_conf = tf.ConfigProto(
                allow_soft_placement=self.allow_soft_placement,
                log_device_placement=self.log_device_placement)
            sess = tf.Session(config=session_conf)
            with sess.as_default():
                # Load the saved meta graph and restore variables
                saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
                saver.restore(sess, checkpoint_file)
                self.session = sess
                # Get params
                predictions = graph.get_operation_by_name("output/predictions").outputs[0]
                input_text = graph.get_operation_by_name("input_text").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
    else:
        train_x, y = self.load_train()
        self.fit(train_x, y)
        # Get params
        predictions = self.model.predictions
        input_text = self.model.input_text
        dropout_keep_prob = self.model.dropout_keep_prob

    test_x = self.load_test()
    batches = u.batch_iter(list(test_x), self.batch_size, 1, shuffle=False)

    print("Creating submission")
    all_predictions = []  # Collect the predictions here
    for x_batch in batches:
        batch_predictions = self.session.run(predictions, {
            input_text: x_batch,
            dropout_keep_prob: 1.0
        })
        all_predictions = np.concatenate([all_predictions, batch_predictions])

    # Map to correct labels
    preds = np.asarray([-1 if pred == 0 else pred for pred in all_predictions])
    u.write_submission(preds.astype(int),
                       self.subm + self.name + "_submission.csv")
    print("Submission successfully created")
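# u.batch_iter is not shown in this excerpt. In common text-CNN setups this
# helper slices the dataset into mini-batches for a given number of epochs,
# optionally shuffling between epochs; a plausible sketch under that
# assumption (the exact signature of the real helper may differ):
import numpy as np


def batch_iter(data, batch_size, num_epochs, shuffle=True):
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for _ in range(num_epochs):
        indices = np.random.permutation(data_size) if shuffle else np.arange(data_size)
        for batch_num in range(num_batches_per_epoch):
            start = batch_num * batch_size
            end = min(start + batch_size, data_size)
            yield data[indices[start:end]]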
def pandas_assignment():
    data = pd.read_csv("data/titanic.csv", index_col="PassengerId")

    subm11 = data["Sex"].value_counts()
    write_submission(" ".join([str(x) for x in subm11]), "11")

    write_submission(int(data["Survived"].value_counts(normalize=True).to_dict()[1] * 100), "12")
    write_submission(int(data["Pclass"].value_counts(normalize=True).to_dict()[1] * 100), "13")

    subm14 = []
    subm14.append(round(float(data["Age"].mean()), 1))
    subm14.append(int(data["Age"].median()))
    write_submission(" ".join([str(x) for x in subm14]), "14")

    write_submission(round(data.corr("pearson")["SibSp"]["Parch"], 2), "15")

    # Most frequent first name among women (Miss/Mrs), extracted with a raw-string regex.
    write_submission(
        data["Name"]
        .str.extract(r"(Miss\. |Mrs\.[A-Za-z ]*\()([A-Za-z]*)")[1]
        .value_counts()
        .head(n=1)
        .to_string()
        .split(" ", 1)[0],
        "16",
    )