Example #1
def main():
    data = pd.read_csv('data/svm-data.csv', header=None)
    X = data.iloc[:, 1:].values
    y = data.iloc[:, 0].values
    clf = SVC(C=100000, random_state=241)
    clf.fit(X, y)
    result = str([x+1 for x in clf.support_])
    write_submission(result, '61')
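Several of the assignment-style examples below call a two-argument write_submission(answer, task_number) helper that is not shown; the larger examples use their own variants. A minimal sketch of one plausible implementation, assuming the helper simply writes the answer to a numbered text file (the name, signature and directory layout are assumptions, not the original code):

import os

def write_submission(answer, task_id, out_dir='submissions'):
    # Persist a single answer as e.g. submissions/61.txt so it can be
    # pasted or uploaded to the grader.
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    with open(os.path.join(out_dir, '{0}.txt'.format(task_id)), 'w') as f:
        f.write(str(answer))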
Example #2
def main():
    model = utils.load_model()
    
    valid_df = utils.get_valid_df()
    
    predictions = model.predict(valid_df)
    predictions = predictions.reshape(len(predictions), 1)
    
    utils.write_submission(predictions)
Example #3
def main():
    boston = load_boston()
    X = scale(boston.data)
    y = boston.target
    classifier = KNeighborsRegressor(n_neighbors=5, weights='distance')
    kf = KFold(len(X), n_folds=5, shuffle=True, random_state=42)
    p_range = np.linspace(1.0, 10.0, num=200)

    write_submission(
        int(classifier_choice_cv(
            X, y, classifier, 'p', p_range, kf,
            scoring='mean_squared_error')[0]),
        '41')
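Examples #3 and #4 delegate the hyperparameter search to a classifier_choice_cv helper that is not shown. A sketch of what such a helper plausibly does, assuming it sweeps a single parameter with cross-validation and returns the best value together with its score; it is written against the modern sklearn.model_selection API, whereas the examples themselves use the older KFold(n, n_folds=...) interface:

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_score

def classifier_choice_cv(X, y, estimator, param_name, param_range, cv,
                         scoring=None):
    # Evaluate one model per candidate value of `param_name` and keep the
    # value with the highest mean cross-validated score.
    best_value, best_score = None, -np.inf
    for value in param_range:
        # Accept either an estimator class (Example #4) or an instance (Example #3).
        if isinstance(estimator, type):
            model = estimator(**{param_name: value})
        else:
            model = clone(estimator).set_params(**{param_name: value})
        score = cross_val_score(model, X, y, cv=cv, scoring=scoring).mean()
        if score > best_score:
            best_value, best_score = value, score
    return best_value, best_score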
Example #4
def main():
    wine_file = 'data/wine.data'
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
    try:
        data = pd.read_csv(wine_file, header=None)
    except IOError:
        print ('No {0} file found, downloading it from {1}'
               .format(wine_file, url))
        wine_request = requests.get(url).content
        data = pd.read_csv(io.StringIO(wine_request.decode('utf-8')),
                           header=None)

    X = data.iloc[:, 1:].values
    X_scaled = scale(X)
    y = data.iloc[:, 0].values
    kf = KFold(len(data.index), n_folds=5, shuffle=True, random_state=42)
    neighbors_range = [x for x in range(1, 51)]

    write_submission(
        classifier_choice_cv(
            X, y, KNeighborsClassifier, 'n_neighbors', neighbors_range, kf)[0],
        '31')
    write_submission(
        classifier_choice_cv(
            X, y, KNeighborsClassifier, 'n_neighbors', neighbors_range, kf)[1],
        '32')
    write_submission(
        classifier_choice_cv(
            X_scaled, y, KNeighborsClassifier, 'n_neighbors', neighbors_range, kf)[0],
        '33')
    write_submission(
        classifier_choice_cv(
            X_scaled, y, KNeighborsClassifier, 'n_neighbors', neighbors_range, kf)[1],
        '34')
Example #5
    def generate_predict(self):
        # Train model and create a submission
        if not self.hasTrained:
            train_x, y = self.load_train()
            self.fit(train_x, y)
            self.hasTrained = True

        print("Creating submission")
        test_features = self.load_test()
        preds = self.compute_predict(test_features)
        preds = self.transform(preds)

        u.write_submission(preds.astype(int), self.subm + self.name + "_submission.csv")
        print("Submission successfully created")
Example #6
def trees_assignment():
    data = pd.read_csv("data/titanic.csv", index_col="PassengerId")
    data21 = data.dropna(subset=["Pclass", "Fare", "Age", "Survived", "Sex"])
    pd.options.mode.chained_assignment = None  # suppress a false-positive SettingWithCopyWarning
    data21["Sex"] = data21["Sex"].map({"female": 0, "male": 1})
    pd.options.mode.chained_assignment = "warn"  # turn the warning back on
    feature_names = ["Pclass", "Fare", "Age", "Sex"]
    X = data21[feature_names]
    y = data21["Survived"]

    clf = DecisionTreeClassifier(random_state=241)
    clf.fit(X, y)
    importances = clf.feature_importances_
    indices = np.argsort(importances)[::-1]
    subm21 = [feature_names[f] for f in indices][:2]

    write_submission(subm21, "21")
Example #7
def main():
    data = {}
    scaler = StandardScaler()

    for data_type in ['train', 'test']:
        df = pd.read_csv('data/perceptron-{0}.csv'.format(data_type),
                         header=None)
        data['X_' + data_type] = df.iloc[:, 1:].values
        data['y_' + data_type] = df.iloc[:, 0].values

    data['X_train_scaled'] = scaler.fit_transform(data['X_train'])
    data['X_test_scaled'] = scaler.transform(data['X_test'])

    acc = get_accuracy(data['X_train'], data['y_train'],
                       data['X_test'], data['y_test'],)
    acc_scaled = get_accuracy(data['X_train_scaled'], data['y_train'],
                              data['X_test_scaled'], data['y_test'])

    write_submission(round(abs(acc - acc_scaled), 3), '51')
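The get_accuracy helper in Example #7 is not shown; since the assignment compares a perceptron trained on raw versus standardized features, a plausible sketch looks like the following (the estimator and the seed are assumptions):

from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score

def get_accuracy(X_train, y_train, X_test, y_test):
    # Train a perceptron with a fixed seed and report its test-set accuracy.
    clf = Perceptron(random_state=241)
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))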
Example #8
def main():
    if not os.path.exists(ckpt_path + 'checkpoint'):
        print('No saved model found, please check the ckpt path')
        exit()
    print('Loading model...')
    W_embedding = np.load(embedding_path)
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        model = network.TextCNN(W_embedding, settings)
        model.saver.restore(sess, tf.train.latest_checkpoint(ckpt_path))
        #print('Local predicting...')
        #print ('valid batches:%d'%n_va_batches)
        #local_predict(sess, model)
        print('Test predicting...')
        print('test batches:%d' % n_tr_batches)
        results = predict(sess, model)
        sub_path_name = sub_path + model_name + str(
            strftime("%m%d%H%M")) + '.csv'
        id_list = get_id_list(id_list_path)
        #         id_list=id_list[:len(results)]
        write_submission(sub_path_name, id_list, results, sr_id2title)
Example #9
def main():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s')
    newsgroups = fetch_20newsgroups(subset='all',
                                    categories=['alt.atheism', 'sci.space'])

    vectorizer = TfidfVectorizer()
    y = newsgroups.target
    texts = newsgroups.data
    X_train = vectorizer.fit_transform(texts)
    grid = {'C': np.power(10.0, np.arange(-5, 6))}
    cv = KFold(X_train.shape[0], n_folds=5, shuffle=True, random_state=241)
    clf = SVC(kernel='linear', random_state=241)
    gs = GridSearchCV(clf, grid, scoring='accuracy', cv=cv)

    gs.fit(X_train, y)
    clf.set_params(**gs.best_params_)
    clf.fit(X_train, y)
    result = show_top10(clf, vectorizer)
    result.sort()
    write_submission(str(
            [x for x in result]).lower().encode('ascii', 'ignore'),
        '71')  # still need some work to get rid of unicode problem
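show_top10 in Example #9 is not defined in the excerpt; presumably it returns the ten words whose linear-SVM coefficients have the largest absolute values. A sketch under that assumption (it relies on the recent get_feature_names_out API and on coef_ being sparse, which holds here because the TF-IDF matrix is sparse):

import numpy as np

def show_top10(clf, vectorizer):
    # For a linear SVM over TF-IDF features, coefficient magnitude indicates
    # how strongly a word separates the two classes.
    words = np.asarray(vectorizer.get_feature_names_out())
    top10 = np.argsort(np.abs(clf.coef_.toarray()[0]))[-10:]
    return [words[i] for i in top10]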
Example #10
def main():
    data = np.genfromtxt("data/data-logistic.csv", delimiter=",")
    y = data[:, 0]
    X = data[:, 1:]
    e_convergence = 0.00001
    max_iteration = 10000
    c_reg = 10
    k_step = 0.1
    weights_start = [[0.0, 0.0]]

    euclidean_distance = maxint
    iteration_count = 0
    while math.sqrt(euclidean_distance) > e_convergence and iteration_count < max_iteration:
        weights_gradient = update_weights(X, y, weights_start, k_step)

        euclidean_distance = 0
        for w in xrange(len(weights_gradient[0])):
            euclidean_distance += (weights_gradient[-1][w] - weights_gradient[-2][w]) ** 2
        iteration_count += 1
    final_w = weights_gradient[-1]

    euclidean_distance = maxint
    iteration_count = 0
    while math.sqrt(euclidean_distance) > e_convergence and iteration_count < max_iteration:
        weights_gradient_reg = update_weights_reg(X, y, weights_start, c_reg, k_step)

        euclidean_distance = 0
        for w in xrange(len(weights_gradient_reg[0])):
            euclidean_distance += (weights_gradient_reg[-1][w] - weights_gradient_reg[-2][w]) ** 2
        iteration_count += 1
    final_w_reg = weights_gradient_reg[-1]

    y_scores = [sigmoid(X[i].tolist(), final_w) for i in xrange(len(X))]
    y_scores_reg = [sigmoid(X[i].tolist(), final_w_reg) for i in xrange(len(X))]

    write_submission([round(roc_auc_score(y, y_scores), 3), round(roc_auc_score(y, y_scores_reg), 3)], "81")
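Example #10 is written for Python 2 (xrange, maxint) and leaves sigmoid, update_weights and update_weights_reg undefined. Assuming the helpers keep a growing history of weight vectors, so the caller can compare the last two entries, and take one gradient step of the usual logistic-regression update with an optional L2 penalty, they might look roughly like this:

import math

def sigmoid(x_row, w):
    # P(y = +1 | x) for a two-feature logistic model.
    return 1.0 / (1.0 + math.exp(-w[0] * x_row[0] - w[1] * x_row[1]))

def update_weights_reg(X, y, weights_history, c_reg, k_step):
    # One gradient-ascent step on the average log-likelihood with an L2
    # penalty; the new weight vector is appended to the (mutated) history.
    w = weights_history[-1]
    l = len(y)
    new_w = []
    for j in range(len(w)):
        grad = sum(
            y[i] * X[i][j] *
            (1.0 - 1.0 / (1.0 + math.exp(-y[i] * (w[0] * X[i][0] + w[1] * X[i][1]))))
            for i in range(l)) / l
        new_w.append(w[j] + k_step * grad - k_step * c_reg * w[j])
    weights_history.append(new_w)
    return weights_history

def update_weights(X, y, weights_history, k_step):
    # Unregularised variant: the same step with C = 0.
    return update_weights_reg(X, y, weights_history, 0.0, k_step)

With this in-place design the second loop in the example continues from the unregularised solution rather than from zero weights, because it re-passes the same weights_start list; whether that is intended cannot be told from the excerpt.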
Example #11
X, y, X_test, y_label_encoder = utils.training_data()
# X, X_test = utils.extract_feature(X, X_test, "bypass", False)

X_train, X_cv, y_train, y_cv, cv_indices = utils.split_data(X, y)
y_train = to_categorical(y_train)
y_cv = to_categorical(y_cv)
y_full = to_categorical(y)

lb = LabelBinarizer()
lb.fit(y_train)
y_train = lb.transform(y_train)
y_test = lb.transform(y_cv)
num_classes = y_train.shape[1]

X_train = np.transpose(X_train, (0, 2, 1))
X_cv = np.transpose(X_cv, (0, 2, 1))
X_test = np.transpose(X_test, (0, 2, 1))

X_full = np.transpose(X, (0, 2, 1))

X_train = X_train[..., np.newaxis]
X_cv = X_cv[..., np.newaxis]
X_test = X_test[..., np.newaxis]
X_full = X_full[..., np.newaxis]

model = load_model("trained_model.h5")
y_pred_test = model.predict_classes(X_test)

y_pred_labels = list(y_label_encoder.inverse_transform(y_pred_test))
utils.write_submission("neural_prediction", y_pred_labels)
Example #12
def main():
    input_dir = "/amit/kaggle/tgs"
    output_dir = "/artifacts"
    image_size_target = 128
    batch_size = 32
    epochs_to_train = 300
    bce_loss_weight_gamma = 0.98
    sgdr_min_lr = 0.0001  # 0.0001, 0.001
    sgdr_max_lr = 0.001  # 0.001, 0.03
    sgdr_cycle_epochs = 20
    sgdr_cycle_epoch_prolongation = 3
    sgdr_cycle_end_patience = 3
    train_abort_epochs_without_improval = 30
    ensemble_model_count = 3
    swa_epoch_to_start = 30

    model_dir = sys.argv[1] if len(sys.argv) > 1 else None

    train_data = TrainData(input_dir)

    train_set = TrainDataset(train_data.train_set_df, image_size_target, augment=True)
    train_set_data_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=8)

    val_set = TrainDataset(train_data.val_set_df, image_size_target, augment=False)
    val_set_data_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, num_workers=2)

    if model_dir:
        model = create_model(pretrained=False).to(device)
        model.load_state_dict(torch.load("{}/model.pth".format(model_dir), map_location=device))
    else:
        model = create_model(pretrained=True).to(device)

    torch.save(model.state_dict(), "{}/model.pth".format(output_dir))

    swa_model = create_model(pretrained=False).to(device)

    print("train_set_samples: %d, val_set_samples: %d" % (len(train_set), len(val_set)))

    global_val_precision_best_avg = float("-inf")
    global_swa_val_precision_best_avg = float("-inf")
    sgdr_cycle_val_precision_best_avg = float("-inf")

    epoch_iterations = len(train_set) // batch_size

    # optimizer = optim.SGD(model.parameters(), lr=sgdr_max_lr, weight_decay=0, momentum=0.9, nesterov=True)
    optimizer = optim.Adam(model.parameters(), lr=sgdr_max_lr)
    lr_scheduler = CosineAnnealingLR(optimizer, T_max=sgdr_cycle_epochs, eta_min=sgdr_min_lr)

    optim_summary_writer = SummaryWriter(log_dir="{}/logs/optim".format(output_dir))
    train_summary_writer = SummaryWriter(log_dir="{}/logs/train".format(output_dir))
    val_summary_writer = SummaryWriter(log_dir="{}/logs/val".format(output_dir))
    swa_val_summary_writer = SummaryWriter(log_dir="{}/logs/swa_val".format(output_dir))

    sgdr_iterations = 0
    sgdr_reset_count = 0
    batch_count = 0
    epoch_of_last_improval = 0
    sgdr_next_cycle_end_epoch = sgdr_cycle_epochs + sgdr_cycle_epoch_prolongation
    swa_update_count = 0

    ensemble_model_index = 0
    for model_file_path in glob.glob("{}/model-*.pth".format(output_dir)):
        model_file_name = os.path.basename(model_file_path)
        model_index = int(model_file_name.replace("model-", "").replace(".pth", ""))
        ensemble_model_index = max(ensemble_model_index, model_index + 1)

    print('{"chart": "best_val_precision", "axis": "epoch"}')
    print('{"chart": "val_precision", "axis": "epoch"}')
    print('{"chart": "val_loss", "axis": "epoch"}')
    print('{"chart": "sgdr_reset", "axis": "epoch"}')
    print('{"chart": "precision", "axis": "epoch"}')
    print('{"chart": "loss", "axis": "epoch"}')
    print('{"chart": "swa_val_precision", "axis": "epoch"}')
    print('{"chart": "swa_val_loss", "axis": "epoch"}')

    train_start_time = time.time()

    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(epochs_to_train):
        epoch_start_time = time.time()
        model.train()

        train_loss_sum = 0.0
        train_precision_sum = 0.0
        train_step_count = 0
        for batch in train_set_data_loader:
            images, masks, mask_weights = \
                batch[0].to(device, non_blocking=True), \
                batch[1].to(device, non_blocking=True), \
                batch[2].to(device, non_blocking=True)

            lr_scheduler.step(epoch=min(sgdr_cycle_epochs, sgdr_iterations / epoch_iterations))

            optimizer.zero_grad()
            prediction_logits = model(images)
            predictions = torch.sigmoid(prediction_logits)
            criterion.weight = mask_weights
            loss = criterion(prediction_logits, masks)
            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_precision_sum += np.mean(precision_batch(predictions, masks))
            sgdr_iterations += 1
            train_step_count += 1
            batch_count += 1

            optim_summary_writer.add_scalar("lr", get_learning_rate(optimizer), batch_count + 1)

        train_loss_avg = train_loss_sum / train_step_count
        train_precision_avg = train_precision_sum / train_step_count

        val_loss_avg, val_precision_avg = evaluate(model, val_set_data_loader, criterion)

        model_improved_within_sgdr_cycle = val_precision_avg > sgdr_cycle_val_precision_best_avg
        if model_improved_within_sgdr_cycle:
            torch.save(model.state_dict(), "{}/model-{}.pth".format(output_dir, ensemble_model_index))
            sgdr_cycle_val_precision_best_avg = val_precision_avg

        model_improved = val_precision_avg > global_val_precision_best_avg
        ckpt_saved = False
        if model_improved:
            torch.save(model.state_dict(), "{}/model.pth".format(output_dir))
            global_val_precision_best_avg = val_precision_avg
            ckpt_saved = True

        swa_model_improved = False
        if epoch + 1 >= swa_epoch_to_start:
            if model_improved_within_sgdr_cycle:
                swa_update_count += 1
                moving_average(swa_model, model, 1.0 / swa_update_count)
                bn_update(train_set_data_loader, swa_model)

            swa_model_improved = val_precision_avg > global_swa_val_precision_best_avg
            if swa_model_improved:
                torch.save(swa_model.state_dict(), "{}/swa_model.pth".format(output_dir))
                global_swa_val_precision_best_avg = val_precision_avg

        if model_improved or swa_model_improved:
            epoch_of_last_improval = epoch

        sgdr_reset = False
        if (epoch + 1 >= sgdr_next_cycle_end_epoch) and (epoch - epoch_of_last_improval >= sgdr_cycle_end_patience):
            sgdr_iterations = 0
            sgdr_next_cycle_end_epoch = epoch + 1 + sgdr_cycle_epochs + sgdr_cycle_epoch_prolongation
            ensemble_model_index += 1
            sgdr_cycle_val_precision_best_avg = float("-inf")
            sgdr_reset_count += 1
            sgdr_reset = True

        swa_val_loss_avg, swa_val_precision_avg = evaluate(swa_model, val_set_data_loader, criterion)

        optim_summary_writer.add_scalar("sgdr_reset", sgdr_reset_count, epoch + 1)

        train_summary_writer.add_scalar("loss", train_loss_avg, epoch + 1)
        train_summary_writer.add_scalar("precision", train_precision_avg, epoch + 1)

        val_summary_writer.add_scalar("loss", val_loss_avg, epoch + 1)
        val_summary_writer.add_scalar("precision", val_precision_avg, epoch + 1)

        swa_val_summary_writer.add_scalar("loss", swa_val_loss_avg, epoch + 1)
        swa_val_summary_writer.add_scalar("precision", swa_val_precision_avg, epoch + 1)

        epoch_end_time = time.time()
        epoch_duration_time = epoch_end_time - epoch_start_time

        print(
            "[%03d/%03d] %ds, lr: %.6f, loss: %.3f, val_loss: %.3f|%.3f, prec: %.3f, val_prec: %.3f|%.3f, ckpt: %d, rst: %d" % (
                epoch + 1,
                epochs_to_train,
                epoch_duration_time,
                get_learning_rate(optimizer),
                train_loss_avg,
                val_loss_avg,
                swa_val_loss_avg,
                train_precision_avg,
                val_precision_avg,
                swa_val_precision_avg,
                int(ckpt_saved),
                int(sgdr_reset)),
            flush=True)

        print('{"chart": "best_val_precision", "x": %d, "y": %.3f}' % (epoch + 1, global_val_precision_best_avg))
        print('{"chart": "val_precision", "x": %d, "y": %.3f}' % (epoch + 1, val_precision_avg))
        print('{"chart": "val_loss", "x": %d, "y": %.3f}' % (epoch + 1, val_loss_avg))
        print('{"chart": "sgdr_reset", "x": %d, "y": %.3f}' % (epoch + 1, sgdr_reset_count))
        print('{"chart": "precision", "x": %d, "y": %.3f}' % (epoch + 1, train_precision_avg))
        print('{"chart": "loss", "x": %d, "y": %.3f}' % (epoch + 1, train_loss_avg))
        print('{"chart": "swa_val_precision", "x": %d, "y": %.3f}' % (epoch + 1, swa_val_precision_avg))
        print('{"chart": "swa_val_loss", "x": %d, "y": %.3f}' % (epoch + 1, swa_val_loss_avg))

        if sgdr_reset and sgdr_reset_count >= ensemble_model_count and epoch - epoch_of_last_improval >= train_abort_epochs_without_improval:
            print("early abort")
            break

    optim_summary_writer.close()
    train_summary_writer.close()
    val_summary_writer.close()
    swa_val_summary_writer.close()

    train_end_time = time.time()
    print()
    print("Train time: %s" % str(datetime.timedelta(seconds=train_end_time - train_start_time)))

    eval_start_time = time.time()

    print()
    print("evaluation of the training model")

    model.load_state_dict(torch.load("{}/model.pth".format(output_dir), map_location=device))

    analyze(Ensemble([model]), train_data.val_set_df, use_tta=False)
    analyze(Ensemble([model]), train_data.val_set_df, use_tta=True)

    score_to_model = {}
    ensemble_model_candidates = glob.glob("{}/model-*.pth".format(output_dir))
    ensemble_model_candidates.append("{}/swa_model.pth".format(output_dir))
    for model_file_path in ensemble_model_candidates:
        model_file_name = os.path.basename(model_file_path)
        m = create_model(pretrained=False).to(device)
        m.load_state_dict(torch.load(model_file_path, map_location=device))
        val_loss_avg, val_precision_avg = evaluate(m, val_set_data_loader, criterion)
        print("ensemble '%s': val_loss=%.3f, val_precision=%.3f" % (model_file_name, val_loss_avg, val_precision_avg))
        if len(score_to_model) < ensemble_model_count:
            score_to_model[val_precision_avg] = m
        elif min(score_to_model.keys()) < val_precision_avg:
            del score_to_model[min(score_to_model.keys())]
            score_to_model[val_precision_avg] = m

    ensemble_models = list(score_to_model.values())
    for ensemble_model in ensemble_models:
        val_loss_avg, val_precision_avg = evaluate(ensemble_model, val_set_data_loader, criterion)
        print("ensemble: val_loss=%.3f, val_precision=%.3f" % (val_loss_avg, val_precision_avg))

    model = Ensemble(ensemble_models)
    mask_threshold_global, mask_threshold_per_cc = analyze(model, train_data.val_set_df, use_tta=True)

    eval_end_time = time.time()
    print()
    print("Eval time: %s" % str(datetime.timedelta(seconds=eval_end_time - eval_start_time)))

    print()
    print("submission preparation")

    submission_start_time = time.time()

    test_data = TestData(input_dir)
    calculate_predictions(test_data.df, model, use_tta=True)
    calculate_prediction_masks(test_data.df, mask_threshold_global)

    print()
    print(test_data.df.groupby("predictions_cc").agg({"predictions_cc": "count"}))

    write_submission(test_data.df, "prediction_masks", "{}/{}".format(output_dir, "submission.csv"))
    write_submission(test_data.df, "prediction_masks_best", "{}/{}".format(output_dir, "submission_best.csv"))

    submission_end_time = time.time()
    print()
    print("Submission time: %s" % str(datetime.timedelta(seconds=submission_end_time - submission_start_time)))
Example #13
    with open('result_statistics_cross_val_marleen.txt', mode='a+') as f:
        f.write(
            '%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.5f\t%s\t%.0f\n' %
            (k, rand_seed,
             len(feats), n_estimators, all_n, len(df_to_train) / 100000,
             len(df_to_test) / 100000, learning_rate, downsampling_rate,
             metric.calc_mean(teid_arr, terelevance_arr,
                              tepred), balance_flag, min_samples_leaf))

    results_df = df_to_test[['srch_id', 'prop_id', 'relevance']].copy()
    results_df['score'] = -1 * tepred

    # predictions = list(-1.0*predictions)
    recommendations = zip(results_df["srch_id"], results_df["prop_id"],
                          results_df['relevance'], results_df['score'])
    utils.write_submission(recommendations, "lambdamart_test.csv")

    path_results = "lambdamart_test.csv"
    nDCG_result = nDCG.compute_ndcg(path_results)
    print(nDCG_result)
    with open('result_statistics_our_nDCG_cross_val_marleen.txt',
              mode='a+') as f:
        f.write(
            '%s\t%s\t%s\t%s\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.5f\t%s\t%.0f\n' %
            (k, rand_seed, len(feats), n_estimators, all_n, len(df_to_train) /
             100000, len(df_to_test) / 100000, learning_rate,
             downsampling_rate, nDCG_result, balance_flag, min_samples_leaf))

    import pickle
    model_save = pickle.dumps(model)
    new_model = pickle.loads(model_save)
Example #14
print "cross validation results:"
print roc_aucs

############################################################################
# fit and predict
############################################################################


result = fb_funcs.fit_and_predict(info_humans, info_bots, info_test,
                                  params=params_ens, scale='log')
y_test_proba = result['y_test_proba']
ytps = result['ytps']

# feature_importances = pd.DataFrame(np.array([result['features'],
#                                              result['importances']]).T)

############################################################################
# submission file generation
############################################################################
submissionfile = 'data/submi/sub_ens.csv'
testfile = 'data/test.csv'

print "writing a submission file..."
write_submission(y_test_proba, info_test.index, testfile, submissionfile)

print "writing results from different models..."
for i in range(len(ytps)):
    submf = 'data/submi/sub_%s.csv' %(params_ens[i]['model'])
    write_submission(ytps[i], info_test.index, testfile, submf)
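The four-argument write_submission in Example #14 differs from the two-argument helper used by the assignment-style examples: it receives predicted probabilities, the index of the scored test ids, the original test file and an output path. One plausible sketch, assuming the submission must contain every id from the test file with a default probability of zero for ids that were never scored (the column names are assumptions):

import pandas as pd

def write_submission(y_proba, scored_index, testfile, submissionfile):
    # Align the predicted probabilities with the full list of ids in the
    # official test file; unscored ids fall back to probability 0.0.
    test_ids = pd.read_csv(testfile)['bidder_id']
    proba = pd.Series(y_proba, index=scored_index)
    sub = pd.DataFrame({'bidder_id': test_ids})
    sub['prediction'] = sub['bidder_id'].map(proba).fillna(0.0)
    sub.to_csv(submissionfile, index=False)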
Example #15
    def generate_predict(self):
        # Train model and create a submission

        # If a checkpoint is available, restore it instead of training from scratch
        if self.checkpoint_dir:
            text_path = os.path.join(os.path.curdir, self.checkpoint_dir, "..",
                                     "text_vocab")
            self.text_vocab_processor = tf.contrib.learn.preprocessing.VocabularyProcessor.restore(
                text_path)

            checkpoint_file = tf.train.latest_checkpoint(self.checkpoint_dir)

            graph = tf.Graph()
            with graph.as_default():
                session_conf = tf.ConfigProto(
                    allow_soft_placement=self.allow_soft_placement,
                    log_device_placement=self.log_device_placement)
                sess = tf.Session(config=session_conf)
                with sess.as_default():
                    # Load the saved meta graph and restore variables
                    saver = tf.train.import_meta_graph(
                        "{}.meta".format(checkpoint_file))
                    saver.restore(sess, checkpoint_file)
                self.session = sess
                # Get params
                predictions = graph.get_operation_by_name(
                    "output/predictions").outputs[0]
                input_text = graph.get_operation_by_name(
                    "input_text").outputs[0]
                dropout_keep_prob = graph.get_operation_by_name(
                    "dropout_keep_prob").outputs[0]

        else:
            train_x, y = self.load_train()
            self.fit(train_x, y)
            # Get params
            predictions = self.model.predictions
            input_text = self.model.input_text
            dropout_keep_prob = self.model.dropout_keep_prob

        test_x = self.load_test()
        batches = u.batch_iter(list(test_x), self.batch_size, 1, shuffle=False)

        print("Creating submission")
        all_predictions = []
        # Collect the predictions here
        for x_batch in batches:
            batch_predictions = self.session.run(predictions, {
                input_text: x_batch,
                dropout_keep_prob: 1.0
            })
            all_predictions = np.concatenate(
                [all_predictions, batch_predictions])

        # Map to correct labels
        preds = np.asarray(
            [-1 if pred == 0 else pred for pred in all_predictions])

        u.write_submission(preds.astype(int),
                           self.subm + self.name + "_submission.csv")
        print("Submission successfully created")
Example #16
def pandas_assignment():
    data = pd.read_csv("data/titanic.csv", index_col="PassengerId")

    subm11 = data["Sex"].value_counts()
    write_submission(" ".join([str(x) for x in subm11]), "11")

    write_submission(int(data["Survived"].value_counts(normalize=True).to_dict()[1] * 100), "12")

    write_submission(int(data["Pclass"].value_counts(normalize=True).to_dict()[1] * 100), "13")

    subm14 = []
    subm14.append(round(float(data["Age"].mean()), 1))
    subm14.append(int(data["Age"].median()))
    write_submission(" ".join([str(x) for x in subm14]), "14")

    write_submission(round(data.corr("pearson")["SibSp"]["Parch"], 2), "15")

    write_submission(
        data["Name"]
        .str.extract(r"(Miss\. |Mrs\.[A-Za-z ]*\()([A-Za-z]*)")[1]
        .value_counts()
        .head(n=1)
        .to_string()
        .split(" ", 1)[0],
        "16",
    )