def predict_dots(clients, target, model, factor=1):
    """Predict targets for clients using model and specified dot factor.

    Every client's prediction state for ``target`` is reset first. Each dot in
    ``client.dots[factor - 1]`` is then scored with ``model.predict_proba`` and,
    per client, the dot with the highest column-1 probability is recorded in
    ``predicted_target``, ``predicted_target_probability``, ``best_dot`` and
    ``best_dot_proba``. Finally the share of clients whose prediction lies
    within 0.02 (per ``distance_ss``) of a non-zero known target is printed,
    followed by the model's feature importance.

    Args:
        clients: sequence of client objects; they are mutated in place.
        target: index of the target to predict. It selects the label column
            and the per-client slots that are written.
        model: classifier exposing ``predict_proba`` and
            ``get_feature_importance``.
        factor: 1-based dot factor; dots are read from ``dots[factor - 1]``.

    Returns:
        None. When no client has a dot for ``factor`` the per-client state is
        still reset, a note is printed and the model is not invoked.
    """
    print("Predicting target[", target, "] using factor:", factor)
    factor_idx = factor - 1

    # Take the categorical feature description from the first dot that exists
    # anywhere, rather than assuming the first client has one for this factor.
    # NOTE(review): assumes every dot reports the same cat features - confirm.
    first_dot = next(
        (dot for client in clients for dot in client.dots[factor_idx]), None)
    features = first_dot.get_cat_features() if first_dot is not None else None

    train_data = []
    train_labels = [[], []]
    dot_clients = []  # dot_clients[i] owns row i of train_data
    dot_dot = []      # dot_dot[i] is the dot that produced row i
    for client in clients:
        client.predicted_target[target] = [.0, .0]
        client.predicted_target_probability[target] = .0
        client.best_dot[target][factor_idx] = None
        client.best_dot_proba[target][factor_idx] = .0
        for dot in client.dots[factor_idx]:
            data, labels = dot.get_data()
            train_data.append(data)
            train_labels[0].append(labels[0])
            train_labels[1].append(labels[1])
            dot_clients.append(client)
            dot_dot.append(dot)

    if first_dot is None:
        # Nothing to score: leave every client in its freshly reset state.
        print("No dots available for factor", factor, "- prediction skipped")
        return

    train_pool = cb.Pool(np.array(train_data), np.array(train_labels[target]),
                         features)
    results = model.predict_proba(train_pool)
    for i, proba in enumerate(results):
        owner = dot_clients[i]
        # Keep only the most probable dot per client.
        if owner.predicted_target_probability[target] < proba[1]:
            owner.predicted_target_probability[target] = proba[1]
            # The first two values of the data row are stored as the predicted
            # target (presumably the dot coordinates - confirm).
            owner.predicted_target[target] = [
                float(train_data[i][0]),
                float(train_data[i][1])
            ]  #demap_coord([train_data[i][0], train_data[i][1]])
            owner.best_dot[target][factor_idx] = dot_dot[i]
            owner.best_dot_proba[target][factor_idx] = proba[1]

    fold_matches = 0.0
    for client in clients:
        # A zero coordinate on either side is treated as "not available".
        if client.target[target][0] != .0 and client.target[target][1] != .0 and \
                client.predicted_target[target][0] != .0 and client.predicted_target[target][1] != .0 and \
                distance_ss(client.target[target], client.predicted_target[target]) < 0.02:
            fold_matches += 1
    print("Result on all clients:", fold_matches / len(clients))
    print("Feature importance", model.get_feature_importance(train_pool))
def predict_selector(clients, target, model):
    """Predict using selector for targets for clients using model.

    Resets each client's ``predicted_target`` / ``predicted_target_probability``
    for ``target``, builds one data row per eligible client via
    ``client.get_data()``, runs ``model.predict`` and stores the result of
    ``client.best_model_from_label(target, label)`` in
    ``client.best_model[target]``. A client is eligible when none of its dot
    lists at index 1 and above is empty (index 0 is not checked).

    Afterwards the share of clients whose selected best dot lies within 0.02
    (per ``distance_ss``) of a non-zero known target is printed, followed by
    the model's feature importance.

    Args:
        clients: non-empty sequence of client objects; mutated in place.
        target: index of the target the selector is applied to.
        model: classifier exposing ``predict`` and ``get_feature_importance``.

    Returns:
        None.
    """
    print("Predicting selector for target[", target, "] ")
    features = clients[0].get_cat_features()
    train_data = []
    train_labels = [[], []]
    dot_clients = []  # dot_clients[i] owns row i of train_data
    for client in clients:
        client.predicted_target[target] = [.0, .0]
        client.predicted_target_probability[target] = .0
        has_ndots = all(
            len(client.dots[fctr]) != 0 for fctr in range(1, len(client.dots)))
        if has_ndots:
            data, labels = client.get_data()
            train_data.append(data)
            train_labels[0].append(labels[0])
            train_labels[1].append(labels[1])
            dot_clients.append(client)

    train_pool = cb.Pool(np.array(train_data), np.array(train_labels[target]),
                         features)
    results = model.predict(train_pool)
    for client, result in zip(dot_clients, results):
        client.best_model[target] = client.best_model_from_label(
            target, result[0])

    fold_matches = 0.0
    for client in clients:
        # A zero coordinate is treated as "no known target"; the best dot is
        # only looked up once the target is known to be usable.
        if client.target[target][0] == .0 or client.target[target][1] == .0:
            continue
        chosen = client.best_dot[target][client.best_model[target]]
        if chosen is None:
            # The slot may legitimately hold None (no dot was selected for that
            # factor); such a client simply does not count as a match.
            continue
        if chosen.coords[0] != .0 and chosen.coords[1] != .0 and \
                distance_ss(client.target[target], chosen.coords) < 0.02:
            fold_matches += 1
    print("Result on all clients:", fold_matches / len(clients))
    print("Feature importance", model.get_feature_importance(train_pool))
def train_dots(clients, target, factor=1, _model=None, fold_num=5):
    """Cross-validate and fit a dot classifier for one target and factor.

    ``clients`` is cut into ``fold_num`` consecutive folds and every fold into
    ``fold_num`` consecutive subfolds. Within a fold each subfold is used once
    as evaluation set while the remaining subfolds of that fold form the
    training set; only clients whose target has two non-zero coordinates
    contribute dots. The same ``model`` instance is refit for every subfold, so
    the returned model carries the state of the last fit that was performed.
    Per subfold the share of its clients predicted within 0.02 (per
    ``distance_ss``) of their target is printed, then the per-fold and overall
    averages. Finally ``predict_dots`` is run on all clients.

    Args:
        clients: sequence of client objects; their ``predicted_target`` and
            ``predicted_target_probability`` for ``target`` are overwritten.
        target: index of the target to train for.
        factor: 1-based dot factor; dots are read from ``dots[factor - 1]``.
        _model: classifier to fit; a default ``cb.CatBoostClassifier`` is
            created when omitted.
        fold_num: number of folds and of subfolds per fold. With a value that
            is not positive no fitting happens in this function.

    Returns:
        The classifier that was passed in or created.

    Raises:
        ValueError: if no client has a dot for ``factor``.
    """
    print("Training target[", target, "] using factor:", factor)
    factor_idx = factor - 1
    folds = [[[] for i in range(fold_num)] for j in range(fold_num)]
    clients_num = len(clients)
    model = _model
    if model is None:
        model = cb.CatBoostClassifier(
            iterations=100,
            depth=4,
            learning_rate=0.04,
            #custom_loss=['Recall', 'Precision', 'Accuracy'],
            #loss_function='Logloss',
            random_seed=4242,
            use_best_model=True,
            #eval_metric='Logloss',
            #task_type='GPU',
            logging_level='Verbose')

    # Take the categorical feature description from the first dot that exists
    # anywhere, rather than assuming the first client has one for this factor.
    # NOTE(review): assumes every dot reports the same cat features - confirm.
    first_dot = next(
        (dot for client in clients for dot in client.dots[factor_idx]), None)
    if first_dot is None:
        raise ValueError(
            "no dots available for factor %s; nothing to train on" % factor)
    features = first_dot.get_cat_features()

    for client in clients:
        client.predicted_target[target] = [.0, .0]
        client.predicted_target_probability[target] = .0

    if fold_num > 0:
        sub_fold_num = clients_num // fold_num
        for i in range(fold_num):
            fold_start = i * clients_num // fold_num
            for j in range(fold_num):
                lo = fold_start + 0 + j * sub_fold_num // fold_num
                hi = fold_start + (1 + j) * sub_fold_num // fold_num
                print(lo, hi)
                folds[i][j] = clients[lo:hi]

        total_results = []
        for f, fold in enumerate(folds):
            fold_results = []
            for s, eval_clients in enumerate(fold):
                train_data = []
                train_labels = [[], []]
                eval_data = []
                eval_labels = [[], []]
                dot_clients = []  # dot_clients[i] owns row i of eval_data
                for client in eval_clients:
                    if client.target[target][0] != .0 and client.target[
                            target][1] != .0:
                        for dot in client.dots[factor_idx]:
                            data, labels = dot.get_data()
                            eval_data.append(data)
                            eval_labels[0].append(labels[0])
                            eval_labels[1].append(labels[1])
                            dot_clients.append(client)
                for t, train_clients in enumerate(fold):
                    if t == s:
                        continue
                    for client in train_clients:
                        if client.target[target][0] != .0 and client.target[
                                target][1] != .0:
                            for dot in client.dots[factor_idx]:
                                data, labels = dot.get_data()
                                train_data.append(data)
                                train_labels[0].append(labels[0])
                                train_labels[1].append(labels[1])

                if not train_data or not eval_data:
                    # Too few (labelled) clients for this slice: an empty pool
                    # cannot be fitted or scored, and an empty subfold would
                    # divide by zero below.
                    print("Skipping subfold", f, s,
                          "- no data to train or evaluate on")
                    continue

                fold_train_pool = cb.Pool(np.array(train_data),
                                          np.array(train_labels[target]),
                                          features)
                fold_eval_pool = cb.Pool(np.array(eval_data),
                                         np.array(eval_labels[target]),
                                         features)
                model.fit(fold_train_pool, eval_set=fold_eval_pool)
                results = model.predict_proba(fold_eval_pool)
                for i, proba in enumerate(results):
                    owner = dot_clients[i]
                    # Keep only the most probable dot per client.
                    if owner.predicted_target_probability[target] < proba[1]:
                        owner.predicted_target_probability[target] = proba[1]
                        owner.predicted_target[target] = [
                            float(eval_data[i][0]),
                            float(eval_data[i][1])
                        ]  #demap_coord([eval_data[i][0], eval_data[i][1]])

                fold_matches = 0.0
                for client in eval_clients:
                    if client.target[target][0] != .0 and client.target[target][1] != .0 and \
                            client.predicted_target[target][0] != .0 and client.predicted_target[target][1] != .0 and \
                            distance_ss(client.target[target], client.predicted_target[target]) < 0.02:
                        fold_matches += 1
                fold_results.append(fold_matches / len(eval_clients))
                print("Train subfold results:", fold_results)

            # Average over the subfolds that were actually evaluated; this is
            # every subfold unless some had to be skipped above.
            if fold_results:
                total_results.append(sum(fold_results) / len(fold_results))
            else:
                total_results.append(0.0)
            print("Train fold results:", total_results)

        res = 0.0
        for fold_score in total_results:
            res += fold_score
        total_results = res / len(folds)
        print("Train total result:", total_results)

    predict_dots(clients, target, model, factor=factor)
    return model
def main3():
    """Train (or load) dot and selector models, evaluate them, write a solution.

    Steps, in order:
      1. Load the first 5000 training clients and fetch their features.
      2. For both targets and every factor, load ``dots_model_<t>_<i>`` if that
         file exists, otherwise train it with ``predictor3.train_dots`` and
         save it.
      3. Likewise load or train ``selector_<t>`` for both targets.
      4. Run dot and selector prediction on the training clients and print how
         many selected dots lie within 0.02 of the known target, plus how many
         clients had no dot selected.
      5. Load the first 1000 test clients, predict for them, dump them to
         ``final_test_clients.pickle`` and save the solution CSV.
    """
    random.seed(4242)
    clients = client3.load_clients(TRAIN_CLIENTS_PICKLE, TRAIN_ROWS_PICKLE,
                                   TRAIN_CSV)
    #clients = client3.load_clients(TRAIN_CLIENTS_PICKLE+"_1000", TRAIN_ROWS_PICKLE, TRAIN_CSV)
    clients, targets = client3.fetch(
        cls=clients[:5000],
        _targets=None,
        max_factor=MAX_FACTOR,
        clients_pickle_file=TRAIN_CLIENTS_PICKLE,  #+"_1000",
        rows_pickle_file=TRAIN_ROWS_PICKLE,
        csv_file=TRAIN_CSV,
        parallel=True)
    #client3.plot_client_dots_features(clients)

    models = [[None for i in range(MAX_FACTOR)] for t in range(2)]
    selector = [None for t in range(2)]
    for t in range(2):
        for i in range(MAX_FACTOR):
            models[t][i] = cb.CatBoostClassifier(
                iterations=2000,
                depth=4,
                learning_rate=0.05,
                # custom_loss=['Recall', 'Precision', 'Accuracy'],
                # loss_function='Logloss',
                random_seed=4242,
                use_best_model=True,
                od_type='Iter',
                od_wait=500,
                # eval_metric='Logloss',
                # task_type='GPU',
                #logging_level='Verbose',
                logging_level='Silent')
        selector[t] = cb.CatBoostClassifier(
            iterations=1000,
            depth=4,
            learning_rate=0.05,
            # custom_loss=['Recall', 'Precision', 'Accuracy'],
            # loss_function='Logloss',
            loss_function='MultiClass',
            classes_count=32,
            random_seed=4242,
            use_best_model=True,
            od_type='Iter',
            od_wait=500,
            # eval_metric='Logloss',
            # task_type='GPU',
            logging_level='Verbose',
            #logging_level='Silent'
        )

    # Dot models: reuse a saved model when present, otherwise train and save.
    for t in range(2):
        for i in range(MAX_FACTOR):
            dots_file = "dots_model_" + str(t) + "_" + str(i)
            if os.path.isfile(dots_file):
                models[t][i].load_model(fname=dots_file)
            else:
                models[t][i] = predictor3.train_dots(clients,
                                                     _model=models[t][i],
                                                     target=t,
                                                     factor=i + 1)
                models[t][i].save_model(dots_file, format="cbm")

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
    # client3.plot_best_dot_probabilities(clients)
    # sys.exit()

    # Selector models: same load-or-train scheme.
    for t in range(2):
        selector_file = "selector_" + str(t)
        if os.path.isfile(selector_file):
            selector[t].load_model(fname=selector_file)
        else:
            selector[t] = predictor3.train_selector(clients,
                                                    _model=selector[t],
                                                    target=t)
            selector[t].save_model(selector_file, format="cbm")

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(clients, target=t, model=selector[t])

    fold_matches = 0
    none_count = 0  # clients for which the selected model produced no dot
    for t in range(2):
        for client in clients:
            chosen = client.best_dot[t][client.best_model[t]]
            if client.target[t][0] != .0 and client.target[t][1] != .0 \
                    and chosen is not None \
                    and chosen.coords[0] != .0 \
                    and chosen.coords[1] != .0 \
                    and client3.distance_ss(client.target[t], chosen.coords) < 0.02:
                fold_matches += 1
            elif chosen is None:
                # Fall back to the dot picked for the first factor. The
                # coordinates (not the dot object itself) are compared with
                # zero, mirroring the check in the branch above.
                fallback = client.best_dot[t][0]
                if fallback is not None \
                        and fallback.coords[0] != .0 \
                        and fallback.coords[1] != .0 \
                        and client3.distance_ss(client.target[t], fallback.coords) < 0.02:
                    fold_matches += 1
                none_count += 1
    print("Matched:", fold_matches)
    print("None:", none_count)

    # test part
    clients = client3.load_clients(TEST_CLIENTS_PICKLE, TEST_ROWS_PICKLE,
                                   TEST_CSV)
    #clients = client3.load_clients(TEST_CLIENTS_PICKLE+"_1000", TEST_ROWS_PICKLE, TEST_CSV)
    # The second value returned by fetch is not needed for the test set.
    clients, _ = client3.fetch(
        clients[:1000],
        _targets=targets,
        max_factor=MAX_FACTOR,
        clients_pickle_file=TEST_CLIENTS_PICKLE,  #+"_1000",
        rows_pickle_file=TEST_ROWS_PICKLE,
        csv_file=TEST_CSV,
        parallel=True)
    #
    # client3.plot_client_dots_features(clients)
    # client3.plot_all_works(clients)
    # client3.plot_all_homes(clients)
    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(clients, target=t, model=selector[t])
    client3.dump(clients, "final_test_clients.pickle")
    save_solution_to_csv("test_solution_last_last", clients)
    return
def load_clients_and_save_solution():
    """Re-run prediction with saved models and write a solution CSV.

    Loads the training clients and the previously dumped test clients, loads
    every ``dots_model_<t>_<i>`` and ``selector3_<t>`` file that exists, and
    runs dot and selector prediction on both client sets. Afterwards
    ``best_model`` is forced to 0 for every client and target (so the dot of
    the first factor is the one used), the number of training clients matched
    within 0.02 of their target is printed, and the test solution is saved.
    """
    train_clients = client3.load_clients("all_factors_train_clients.pickle",
                                         TRAIN_ROWS_PICKLE, TRAIN_CSV)
    clients = client3.load("final_test_clients.pickle")
    print(clients[0].get_data())
    print(clients[0].str_best())

    models = [[None for i in range(MAX_FACTOR)] for t in range(2)]
    selector = [None for t in range(2)]
    for t in range(2):
        for i in range(MAX_FACTOR):
            models[t][i] = cb.CatBoostClassifier(
                iterations=2000,
                depth=4,
                learning_rate=0.04,
                # custom_loss=['Recall', 'Precision', 'Accuracy'],
                # loss_function='Logloss',
                random_seed=4242,
                use_best_model=True,
                od_type='Iter',
                od_wait=500,
                # eval_metric='Logloss',
                # task_type='GPU',
                logging_level='Verbose')
        selector[t] = cb.CatBoostClassifier(
            iterations=10,
            depth=4,
            learning_rate=0.1,
            # custom_loss=['Recall', 'Precision', 'Accuracy'],
            # loss_function='Logloss',
            loss_function='MultiClass',
            classes_count=8,
            random_seed=4242,
            use_best_model=True,
            od_type='Iter',
            od_wait=500,
            # eval_metric='Logloss',
            # task_type='GPU',
            logging_level='Verbose')

    for t in range(2):
        for i in range(MAX_FACTOR):
            dots_file = "dots_model_" + str(t) + "_" + str(i)
            if os.path.isfile(dots_file):
                models[t][i].load_model(fname=dots_file)

    for t in range(2):
        # Test for the very file that is loaded; checking a differently named
        # file could skip an existing model or try to load a missing one.
        selector_file = "selector3_" + str(t)
        if os.path.isfile(selector_file):
            selector[t].load_model(fname=selector_file)

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(train_clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(train_clients,
                                    target=t,
                                    model=selector[t])
    #
    # client3.dump(train_clients, "all_factors_train_clients.pickle")
    # for t in range(2):
    #     selector[t] = predictor3.train_selector(train_clients, _model=selector[t], target=t)
    #     selector[t].save_model("selector3_" + str(t), format="cbm")

    for t in range(2):
        for i in range(MAX_FACTOR):
            predictor3.predict_dots(clients,
                                    target=t,
                                    model=models[t][i],
                                    factor=i + 1)
        predictor3.predict_selector(clients, target=t, model=selector[t])
    print(clients[0].get_data())
    print(clients[0].str_best())
    #client3.dump(train_clients, "final_train_clients3.pickle")
    #client3.dump(clients, "final_test_clients3.pickle")
    #save_solution_to_csv("test_solution_last", clients)
    print("==================== ============ ======================")

    # Override the selector's choice: always use the first factor's model.
    for t in range(2):
        for client in train_clients:
            client.best_model[t] = 0
            # train_clients[c].best_model_proba[t] = train_clients[c].best_dot_proba[t][0]
            # for f in range(1, MAX_FACTOR):
            #     if train_clients[c].best_dot_proba[t][f] < train_clients[c].best_model_proba[t]:
            #         train_clients[c].best_model[t] = f
            #         train_clients[c].best_model_proba[t] = train_clients[c].best_dot_proba[t][f]

    fold_matches = 0
    none_count = 0  # clients for which the selected model produced no dot
    for t in range(2):
        for client in train_clients:
            chosen = client.best_dot[t][client.best_model[t]]
            if client.target[t][0] != .0 and client.target[t][1] != .0 \
                    and chosen is not None \
                    and chosen.coords[0] != .0 \
                    and chosen.coords[1] != .0 \
                    and client3.distance_ss(client.target[t], chosen.coords) < 0.02:
                fold_matches += 1
            elif chosen is None:
                # Fall back to the dot picked for the first factor. The
                # coordinates (not the dot object itself) are compared with
                # zero, mirroring the check in the branch above.
                fallback = client.best_dot[t][0]
                if fallback is not None \
                        and fallback.coords[0] != .0 \
                        and fallback.coords[1] != .0 \
                        and client3.distance_ss(client.target[t], fallback.coords) < 0.02:
                    fold_matches += 1
                none_count += 1
    print("Matched:", fold_matches)
    print("None:", none_count)

    for t in range(2):
        for client in clients:
            client.best_model[t] = 0
            # clients[c].best_model_proba[t] = clients[c].best_dot_proba[t][0]
            # for f in range(1, MAX_FACTOR):
            #     if clients[c].best_dot_proba[t][f] < clients[c].best_model_proba[t]:
            #         clients[c].best_model[t] = f
            #         clients[c].best_model_proba[t] = clients[c].best_dot_proba[t][f]
    save_solution_to_csv("test_solution_lasT_last", clients)