Ejemplo n.º 1
0
def predict_dots(clients, target, model, factor=1):
    """Score every dot of every client and keep the best one per client.

    For each client the prediction fields for ``target`` are reset, then all
    dots stored under ``client.dots[factor - 1]`` are scored with
    ``model.predict_proba``. The dot with the highest value in the second
    probability column wins: its first two data entries become
    ``client.predicted_target[target]`` and the dot itself is stored in
    ``client.best_dot[target][factor - 1]``.

    Prints the share of clients whose prediction lies within 0.02 (as
    measured by ``distance_ss``) of a non-zero known target, followed by the
    model's feature importance. Returns nothing; clients are updated in place.
    """

    print("Predicting target[", target, "] using factor:", factor)

    level = factor - 1
    features = clients[0].dots[level][0].get_cat_features()

    rows = []
    labels_by_target = ([], [])
    owners = []  # (client, dot) pair for every row, in row order
    for client in clients:
        client.predicted_target[target] = [.0, .0]
        client.predicted_target_probability[target] = .0
        client.best_dot[target][level] = None
        client.best_dot_proba[target][level] = .0
        for dot in client.dots[level]:
            data, labels = dot.get_data()
            rows.append(data)
            labels_by_target[0].append(labels[0])
            labels_by_target[1].append(labels[1])
            owners.append((client, dot))

    train_pool = cb.Pool(np.array(rows), np.array(labels_by_target[target]),
                         features)
    results = model.predict_proba(train_pool)

    for idx, proba in enumerate(results):
        owner, dot = owners[idx]
        score = proba[1]
        # Strict comparison: on ties the earlier dot is kept.
        if owner.predicted_target_probability[target] < score:
            owner.predicted_target_probability[target] = score
            row = rows[idx]
            owner.predicted_target[target] = [float(row[0]), float(row[1])]
            owner.best_dot[target][level] = dot
            owner.best_dot_proba[target][level] = score

    fold_matches = 0.0
    for client in clients:
        actual = client.target[target]
        guess = client.predicted_target[target]
        known = actual[0] != .0 and actual[1] != .0
        predicted = guess[0] != .0 and guess[1] != .0
        if known and predicted and distance_ss(actual, guess) < 0.02:
            fold_matches += 1

    print("Result on all clients:", fold_matches / len(clients))
    print("Feature importance", model.get_feature_importance(train_pool))
Ejemplo n.º 2
0
def predict_selector(clients, target, model):
    """Pick, per client, which per-factor model to trust for ``target``.

    Every client's prediction fields for ``target`` are reset. Clients whose
    dot lists at factor indices 1 and above are all non-empty are turned into
    one data row each (``client.get_data()``) and passed to ``model.predict``.
    The first element of each prediction is mapped through
    ``client.best_model_from_label`` and stored in
    ``client.best_model[target]``. Clients that were skipped keep whatever
    ``best_model[target]`` they already had.

    Prints the share of clients whose selected best dot lies within 0.02 (as
    measured by ``distance_ss``) of a non-zero known target, followed by the
    model's feature importance. Returns nothing; clients are updated in place.
    """

    print("Predicting selector for target[", target, "] ")

    features = clients[0].get_cat_features()

    train_data = []
    train_labels = [[], []]
    dot_clients = []
    for client in clients:
        client.predicted_target[target] = [.0, .0]
        client.predicted_target_probability[target] = .0
        # Factor index 0 is intentionally not inspected, as in the original
        # loop which started at 1.
        has_ndots = all(
            len(client.dots[fctr]) != 0
            for fctr in range(1, len(client.dots)))
        if has_ndots:
            data, labels = client.get_data()
            train_data.append(data)
            train_labels[0].append(labels[0])
            train_labels[1].append(labels[1])
            dot_clients.append(client)

    train_pool = cb.Pool(np.array(train_data), np.array(train_labels[target]),
                         features)
    results = model.predict(train_pool)
    for client, prediction in zip(dot_clients, results):
        client.best_model[target] = client.best_model_from_label(
            target, prediction[0])

    fold_matches = 0.0
    for client in clients:
        actual = client.target[target]
        if actual[0] == .0 or actual[1] == .0:
            continue
        chosen = client.best_dot[target][client.best_model[target]]
        # best_dot entries are None when no dot was selected for that factor;
        # treat that as "no match" instead of failing on `.coords`.
        if chosen is None:
            continue
        coords = chosen.coords
        if coords[0] != .0 and coords[1] != .0 and \
                distance_ss(actual, coords) < 0.02:
            fold_matches += 1

    print("Result on all clients:", fold_matches / len(clients))
    print("Feature importance", model.get_feature_importance(train_pool))
Ejemplo n.º 3
0
def _labelled_dot_rows(fold_clients, target, factor):
    """Collect dot rows of clients whose ``target`` has both coords non-zero.

    Returns ``(rows, labels, owners)`` where ``labels`` is a two-element list
    (one label list per target index) and ``owners[i]`` is the client that
    row ``i`` came from.
    """
    rows = []
    labels = [[], []]
    owners = []
    for client in fold_clients:
        known = client.target[target]
        if known[0] != .0 and known[1] != .0:
            for dot in client.dots[factor - 1]:
                data, dot_labels = dot.get_data()
                rows.append(data)
                labels[0].append(dot_labels[0])
                labels[1].append(dot_labels[1])
                owners.append(client)
    return rows, labels, owners


def _count_close_predictions(fold_clients, target):
    """Count clients whose non-zero prediction is within 0.02 of a non-zero target."""
    hits = 0.0
    for client in fold_clients:
        actual = client.target[target]
        guess = client.predicted_target[target]
        if actual[0] != .0 and actual[1] != .0 and \
                guess[0] != .0 and guess[1] != .0 and \
                distance_ss(actual, guess) < 0.02:
            hits += 1
    return hits


def train_dots(clients, target, factor=1, _model=None, fold_num=5):
    """Fit a dot classifier with nested fold evaluation, then predict on all clients.

    ``clients`` is cut into ``fold_num`` contiguous folds, each of which is
    cut again into ``fold_num`` contiguous sub-folds. Within every fold, each
    sub-fold is used once as the evaluation set while the remaining sub-folds
    of the same fold form the training set; ``model.fit`` is called once per
    sub-fold. Only clients whose ``target`` coordinates are both non-zero
    contribute rows. Per-sub-fold, per-fold and overall match rates are
    printed.

    Args:
        clients: sliceable sequence of client objects.
        target: index (0 or 1) of the target to train for.
        factor: 1-based dot factor; rows come from ``client.dots[factor - 1]``.
        _model: model to fit; a default ``cb.CatBoostClassifier`` is created
            when this is None.
        fold_num: number of folds and of sub-folds per fold; values <= 0 skip
            training entirely.

    Returns:
        The model after the last ``fit`` call.
    """

    print("Training target[", target, "] using factor:", factor)

    folds = [[[] for i in range(fold_num)] for j in range(fold_num)]
    clients_num = len(clients)
    model = _model
    if model is None:
        model = cb.CatBoostClassifier(
            iterations=100,
            depth=4,
            learning_rate=0.04,
            random_seed=4242,
            use_best_model=True,
            logging_level='Verbose')

    features = clients[0].dots[factor - 1][0].get_cat_features()

    for client in clients:
        client.predicted_target[target] = [.0, .0]
        client.predicted_target_probability[target] = .0

    if fold_num > 0:
        sub_fold_num = clients_num // fold_num
        for i in range(fold_num):
            fold_start = i * clients_num // fold_num
            for j in range(fold_num):
                lo = fold_start + j * sub_fold_num // fold_num
                hi = fold_start + (1 + j) * sub_fold_num // fold_num
                print(lo, hi)
                folds[i][j] = clients[lo:hi]

        total_results = []
        for fold in folds:
            fold_results = []
            for s, eval_clients in enumerate(fold):
                eval_data, eval_labels, dot_clients = _labelled_dot_rows(
                    eval_clients, target, factor)

                train_data = []
                train_labels = [[], []]
                for t, other in enumerate(fold):
                    if t == s:
                        continue
                    rows, labels, _ = _labelled_dot_rows(
                        other, target, factor)
                    train_data.extend(rows)
                    train_labels[0].extend(labels[0])
                    train_labels[1].extend(labels[1])

                fold_train_pool = cb.Pool(np.array(train_data),
                                          np.array(train_labels[target]),
                                          features)
                fold_eval_pool = cb.Pool(np.array(eval_data),
                                         np.array(eval_labels[target]),
                                         features)
                # NOTE(review): the same model object is refit for every
                # sub-fold, so the returned model reflects only the last fit.
                model.fit(fold_train_pool, eval_set=fold_eval_pool)
                results = model.predict_proba(fold_eval_pool)
                for idx, proba in enumerate(results):
                    owner = dot_clients[idx]
                    # Strict comparison: on ties the earlier dot is kept.
                    if owner.predicted_target_probability[target] < proba[1]:
                        owner.predicted_target_probability[target] = proba[1]
                        owner.predicted_target[target] = [
                            float(eval_data[idx][0]),
                            float(eval_data[idx][1])
                        ]

                fold_matches = _count_close_predictions(eval_clients, target)
                fold_results.append(fold_matches / len(eval_clients))
            print("Train subfold results:", fold_results)
            total_results.append(sum(fold_results, 0.0) / len(fold))

        print("Train fold results:", total_results)
        overall = sum(total_results, 0.0) / len(folds)

        print("Train total result:", overall)

    predict_dots(clients, target, model, factor=factor)

    return model
Ejemplo n.º 4
0
def main3():
    """Train (or load) dot models and selectors, evaluate, and write a test solution.

    Steps, in order:
      1. Load and fetch the first 5000 training clients.
      2. Build one dot classifier per (target, factor) and one selector per
         target.
      3. For each dot model, load it from ``dots_model_<t>_<i>`` if that file
         exists, otherwise train it with ``predictor3.train_dots`` and save it.
      4. Run dot prediction, then load or train the selectors (each selector
         is saved afterwards in both cases).
      5. Re-run dot and selector prediction and print how many (client,
         target) pairs are matched within 0.02, plus how many had no dot for
         the selected model.
      6. Load and fetch the first 1000 test clients, predict, dump the clients
         to ``final_test_clients.pickle`` and write the solution CSV.
    """
    random.seed(4242)
    clients = client3.load_clients(TRAIN_CLIENTS_PICKLE, TRAIN_ROWS_PICKLE,
                                   TRAIN_CSV)

    clients, targets = client3.fetch(
        cls=clients[:5000],
        _targets=None,
        max_factor=MAX_FACTOR,
        clients_pickle_file=TRAIN_CLIENTS_PICKLE,
        rows_pickle_file=TRAIN_ROWS_PICKLE,
        csv_file=TRAIN_CSV,
        parallel=True)

    models = [[None for i in range(MAX_FACTOR)] for t in range(2)]
    selector = [None for t in range(2)]

    for t in range(2):
        for i in range(MAX_FACTOR):
            models[t][i] = cb.CatBoostClassifier(
                iterations=2000,
                depth=4,
                learning_rate=0.05,
                random_seed=4242,
                use_best_model=True,
                od_type='Iter',
                od_wait=500,
                logging_level='Silent')
        selector[t] = cb.CatBoostClassifier(
            iterations=1000,
            depth=4,
            learning_rate=0.05,
            loss_function='MultiClass',
            classes_count=32,
            random_seed=4242,
            use_best_model=True,
            od_type='Iter',
            od_wait=500,
            logging_level='Verbose',
        )

    for t in range(2):
        for i in range(MAX_FACTOR):
            model_path = "dots_model_" + str(t) + "_" + str(i)
            if os.path.isfile(model_path):
                models[t][i].load_model(fname=model_path)
            else:
                models[t][i] = predictor3.train_dots(clients,
                                                     _model=models[t][i],
                                                     target=t,
                                                     factor=i + 1)
                models[t][i].save_model(model_path, format="cbm")

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)

    for t in range(2):
        selector_path = "selector_" + str(t)
        if os.path.isfile(selector_path):
            selector[t].load_model(fname=selector_path)
        else:
            selector[t] = predictor3.train_selector(clients,
                                                    _model=selector[t],
                                                    target=t)

        # Saved unconditionally, i.e. also right after a successful load.
        selector[t].save_model(selector_path, format="cbm")

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(clients, target=t, model=selector[t])

    fold_matches = 0
    none_count = 0
    for t in range(2):
        for client in clients:
            actual = client.target[t]
            chosen = client.best_dot[t][client.best_model[t]]
            if chosen is None:
                # No dot for the selected model: fall back to the factor-1
                # best dot. The coordinate checks mirror the primary branch;
                # the original compared the dot object itself to .0 twice.
                fallback = client.best_dot[t][0]
                if fallback is not None \
                        and fallback.coords[0] != .0 \
                        and fallback.coords[1] != .0 \
                        and client3.distance_ss(actual,
                                                fallback.coords) < 0.02:
                    fold_matches += 1
                none_count += 1
            elif actual[0] != .0 and actual[1] != .0 \
                    and chosen.coords[0] != .0 \
                    and chosen.coords[1] != .0 \
                    and client3.distance_ss(actual, chosen.coords) < 0.02:
                fold_matches += 1

    print("Matched:", fold_matches)
    print("None:", none_count)

    # test part
    clients = client3.load_clients(TEST_CLIENTS_PICKLE, TEST_ROWS_PICKLE,
                                   TEST_CSV)
    # The second value returned by fetch is not used for the test set.
    clients, _ = client3.fetch(
        clients[:1000],
        _targets=targets,
        max_factor=MAX_FACTOR,
        clients_pickle_file=TEST_CLIENTS_PICKLE,
        rows_pickle_file=TEST_ROWS_PICKLE,
        csv_file=TEST_CSV,
        parallel=True)

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(clients, target=t, model=selector[t])

    client3.dump(clients, "final_test_clients.pickle")
    save_solution_to_csv("test_solution_last_last", clients)
Ejemplo n.º 5
0
def load_clients_and_save_solution():
    """Re-run prediction from saved models and write a solution CSV.

    Loads the training clients and the previously dumped test clients,
    rebuilds the dot classifiers and selectors, loads their weights from disk
    when the corresponding file exists, and runs dot and selector prediction
    on both client sets. Afterwards ``best_model`` is forced to 0 for every
    client (so the factor-1 best dot is used), the number of matched training
    (client, target) pairs is printed, and the test solution is written with
    ``save_solution_to_csv``.
    """
    train_clients = client3.load_clients("all_factors_train_clients.pickle",
                                         TRAIN_ROWS_PICKLE, TRAIN_CSV)
    clients = client3.load("final_test_clients.pickle")

    print(clients[0].get_data())
    print(clients[0].str_best())

    models = [[None for i in range(MAX_FACTOR)] for t in range(2)]
    selector = [None for t in range(2)]

    for t in range(2):
        for i in range(MAX_FACTOR):
            models[t][i] = cb.CatBoostClassifier(
                iterations=2000,
                depth=4,
                learning_rate=0.04,
                random_seed=4242,
                use_best_model=True,
                od_type='Iter',
                od_wait=500,
                logging_level='Verbose')
        selector[t] = cb.CatBoostClassifier(
            iterations=10,
            depth=4,
            learning_rate=0.1,
            loss_function='MultiClass',
            classes_count=8,
            random_seed=4242,
            use_best_model=True,
            od_type='Iter',
            od_wait=500,
            logging_level='Verbose')

    for t in range(2):
        for i in range(MAX_FACTOR):
            model_path = "dots_model_" + str(t) + "_" + str(i)
            if os.path.isfile(model_path):
                models[t][i].load_model(fname=model_path)

    for t in range(2):
        # Check for the same file that is loaded; the original tested
        # "selector_<t>" but then opened "selector3_<t>".
        selector_path = "selector3_" + str(t)
        if os.path.isfile(selector_path):
            selector[t].load_model(fname=selector_path)

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(train_clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(train_clients, target=t, model=selector[t])

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(clients, target=t, model=selector[t])

    print(clients[0].get_data())
    print(clients[0].str_best())

    print("==================== ============ ======================")

    # Override the selector's choice: always use the factor-1 model.
    for t in range(2):
        for client in train_clients:
            client.best_model[t] = 0

    fold_matches = 0
    none_count = 0
    for t in range(2):
        for client in train_clients:
            actual = client.target[t]
            chosen = client.best_dot[t][client.best_model[t]]
            if chosen is None:
                # Fall back to the factor-1 best dot. The coordinate checks
                # mirror the primary branch; the original compared the dot
                # object itself to .0 twice.
                fallback = client.best_dot[t][0]
                if fallback is not None \
                        and fallback.coords[0] != .0 \
                        and fallback.coords[1] != .0 \
                        and client3.distance_ss(actual,
                                                fallback.coords) < 0.02:
                    fold_matches += 1
                none_count += 1
            elif actual[0] != .0 and actual[1] != .0 \
                    and chosen.coords[0] != .0 \
                    and chosen.coords[1] != .0 \
                    and client3.distance_ss(actual, chosen.coords) < 0.02:
                fold_matches += 1

    print("Matched:", fold_matches)
    print("None:", none_count)

    # Same override for the test clients before writing the solution.
    for t in range(2):
        for client in clients:
            client.best_model[t] = 0

    save_solution_to_csv("test_solution_lasT_last", clients)