def main(to_show: bool = True, to_save: bool = False,
         normalize: bool = True, standardize: bool = True) -> None:
    """Load the full positive/negative data set and hand it to ``umap_2``.

    The function prints the preprocessing flags, loads the data through
    ``prepare_data``, calls ``umap_2`` with a fixed set of hyper-parameters
    and finally prints the elapsed time in seconds.

    Args:
        to_show: Forwarded unchanged to ``umap_2``.
        to_save: Forwarded unchanged to ``umap_2``.
        normalize: Forwarded to ``prepare_data``.
        standardize: Forwarded to ``prepare_data``.
    """
    print('norm: {}'.format(normalize))
    print('stand: {}'.format(standardize))

    positive_data_path = 'data/all_positive_X.txt'
    negative_data_path = 'data/all_negative_X.txt'

    # NOTE: alternative projections (pca_3, pca_2, pca_2_incremental,
    # t_sne_2) were previously invoked from here and are currently disabled.

    index = 1

    # Hyper-parameters passed to umap_2.
    # A previous configuration used n_neighbors=200 and min_dist=0.8.
    n_neighbors = 15
    min_dist = 0.1
    metric = "euclidean"

    # perf_counter is monotonic, so the reported duration cannot be skewed
    # by system clock adjustments.
    started_at = time.perf_counter()
    print('iteration: {}'.format(index))

    data: Tuple[np.ndarray, np.ndarray] = prepare_data(
        positive_data_path,
        negative_data_path,
        normalize=normalize,
        standardize=standardize,
    )
    X, y = data[0], data[1]
    print(' << data ready')

    # The title is a fixed string; it has no placeholders, so no formatting
    # arguments are needed.
    title = 'UMAP - Unsafe and Legitimate websites'
    umap_2(X, y, title=title, to_show=to_show, to_save=to_save,
           metric=metric, min_dist=min_dist, n_neighbors=n_neighbors)

    print(' << done in {}'.format(time.perf_counter() - started_at))
def get_model(i: int):
    """Return a newly constructed classifier selected by position.

    Index 0 yields an ``XGBClassifier`` and index 1 a
    ``RandomForestClassifier``, each built with default constructor
    arguments. Only the requested classifier is instantiated.

    Args:
        i: Position of the classifier. Normal sequence indexing applies, so
            negative values count from the end.

    Returns:
        A new instance of the selected classifier class.

    Raises:
        IndexError: If ``i`` is outside the range of available classifiers.
    """
    # Hold the classes, not instances, so the unused model is never built.
    model_factories = (XGBClassifier, RandomForestClassifier)
    return model_factories[i]()


def ratio_of_pos_and_neg(y_train: np.ndarray):
    """Count the positive and negative samples in a label array.

    Despite the name, the function returns two counts, not a ratio.

    Args:
        y_train: Label array. The positive count is taken as its sum, which
            assumes labels are encoded as 0 (negative) / 1 (positive)
            -- TODO confirm against the data loader.

    Returns:
        A ``(positive_samples_count, negative_samples_count)`` tuple, where
        the negative count is the length of the first axis minus the sum.
    """
    total_samples_count = y_train.shape[0]
    positive_samples_count = np.sum(y_train)  # summed once and reused
    negative_samples_count = total_samples_count - positive_samples_count
    assert total_samples_count == positive_samples_count + negative_samples_count
    return positive_samples_count, negative_samples_count


# Load the training split without normalization or standardization.
positive_data_path = 'data/train_positive_X.txt'
negative_data_path = 'data/train_negative_X.txt'
data: Tuple[np.ndarray, np.ndarray] = prepare_data(
    positive_data_path,
    negative_data_path,
    normalize=False,
    standardize=False,
)
X = data[0]
y = data[1]

# Report the class balance (sum of labels vs. the remaining samples).
positive = y.sum()
negative = y.shape[0] - positive
print(positive)
print(negative)

# Shuffle samples and labels with the same permutation; the fixed seed keeps
# the ordering reproducible between runs.
idx = np.random.RandomState(seed=11).permutation(X.shape[0])
X, y = X[idx], y[idx]
# Log file opened in append mode; __print writes every message to it.
# NOTE(review): the handle is not closed within this section -- confirm it is
# closed elsewhere in the file.
log_file_name = 'log.txt'
log_file = open(result_folder + '/' + log_file_name, 'a')


def __print(text: str):
    """Append ``text`` plus a newline to the log file, then print it."""
    log_file.write(text + '\n')
    print(text)


features_and_indexes = get_feature_names_and_indexes()

# Training split, loaded without normalization or standardization.
train_positive_data_path = 'data/train_positive_X.txt'
train_negative_data_path = 'data/train_negative_X.txt'
data: Tuple[np.ndarray, np.ndarray] = prepare_data(
    train_positive_data_path,
    train_negative_data_path,
    normalize=False,
    standardize=False,
)
X_train, y_train = data[0], data[1]
print('training data: {}'.format(X_train.shape))

# Testing split, loaded with the same preprocessing flags.
test_positive_data_path = 'data/test_positive_X.txt'
test_negative_data_path = 'data/test_negative_X.txt'
data: Tuple[np.ndarray, np.ndarray] = prepare_data(
    test_positive_data_path,
    test_negative_data_path,
    normalize=False,
    standardize=False,
)
X_test, y_test = data[0], data[1]
print('testing data: {}'.format(X_test.shape))