def main(to_show: bool = True,
         to_save: bool = False,
         normalize=True,
         standardize=True):
    """Load the full data set and plot a 2-D UMAP projection of it.

    Progress messages and the elapsed wall-clock time are printed to stdout.

    Args:
        to_show: Forwarded unchanged to ``umap_2`` as ``to_show``.
        to_save: Forwarded unchanged to ``umap_2`` as ``to_save``.
        normalize: Forwarded unchanged to ``prepare_data`` as ``normalize``.
        standardize: Forwarded unchanged to ``prepare_data`` as
            ``standardize``.
    """
    print('norm: {}'.format(normalize))
    print('stand: {}'.format(standardize))

    positive_data_path = 'data/all_positive_X.txt'
    negative_data_path = 'data/all_negative_X.txt'

    # Only a single run is performed; the counter is kept for the log line.
    index = 1

    # UMAP settings for this run (an earlier experiment used
    # n_neighbors=200, d=0.8 with the same metric).
    n_neighbors = 15
    d = 0.1
    m = "euclidean"

    ts = time.time()
    print('iteration: {}'.format(index))
    data: Tuple[np.ndarray, np.ndarray] = prepare_data(
        positive_data_path,
        negative_data_path,
        normalize=normalize,
        standardize=standardize,
    )

    X = data[0]
    y = data[1]
    print('     << data ready')

    # Fixed, human-readable title. The original passed the metric name and
    # hyper-parameters to str.format(), but the template has no placeholders,
    # so those arguments never reached the title.
    title = 'UMAP - Unsafe and Legitimate websites'
    umap_2(X,
           y,
           title=title,
           to_show=to_show,
           to_save=to_save,
           metric=m,
           min_dist=d,
           n_neighbors=n_neighbors)
    print('     << done in {}'.format(time.time() - ts))
# Example #2
# 0
def get_model(i: int):
    """Return a newly constructed classifier selected by position.

    Position 0 is ``XGBClassifier`` and position 1 is
    ``RandomForestClassifier``; both are built with their default
    constructor arguments. Only the selected class is instantiated.

    Args:
        i: Index into the model list. Ordinary sequence indexing applies,
            so negative values count from the end.

    Returns:
        A fresh, unfitted instance of the selected classifier.

    Raises:
        IndexError: If ``i`` is outside the range of available models.
    """
    # Hold the classes, not instances, so the unused estimator is never built.
    model_classes = (XGBClassifier, RandomForestClassifier)
    return model_classes[i]()


def ratio_of_pos_and_neg(y_train: np.ndarray):
    """Return the positive and negative sample counts of a label array.

    Despite the name, the two counts are returned, not their ratio.
    The positive count is the sum of all entries of ``y_train``, so the
    result is only a count if the labels are 0/1; this is not validated.

    Args:
        y_train: Label array whose first axis indexes the samples.

    Returns:
        A ``(positive_samples_count, negative_samples_count)`` tuple, where
        the negative count is ``y_train.shape[0]`` minus the positive count.
    """
    # Sum once and reuse it; the negative count is derived from the total.
    positive_samples_count = np.sum(y_train)
    negative_samples_count = y_train.shape[0] - positive_samples_count
    return positive_samples_count, negative_samples_count


# Input files for the two classes of the training set.
positive_data_path = 'data/train_positive_X.txt'
negative_data_path = 'data/train_negative_X.txt'

# Load the raw data with both normalization and standardization disabled.
data: Tuple[np.ndarray, np.ndarray] = prepare_data(
    positive_data_path,
    negative_data_path,
    normalize=False,
    standardize=False,
)
X, y = data[0], data[1]

# Print the label sum and the remainder of the sample count
# (presumably the positive / negative class sizes for 0/1 labels - confirm).
positive = y.sum()
negative = y.shape[0] - positive
print(positive)
print(negative)

# Shuffle X and y with one shared permutation; the fixed seed keeps the
# order reproducible between runs.
idx = np.random.RandomState(seed=11).permutation(X.shape[0])
# idx = np.random.permutation(X.shape[0])
X = X[idx]
y = y[idx]




# print(X[0])
# Log messages are appended to this file inside ``result_folder``.
# NOTE(review): the handle is never closed in the visible code - confirm it
# is closed (or flushed) elsewhere before the script exits.
log_file_name = 'log.txt'
log_file = open(result_folder + '/' + log_file_name, mode='a')


def __print(text: str):
    """Write ``text`` as one line to the log file, then echo it to stdout."""
    entry = text + '\n'  # one log record per line
    log_file.write(entry)
    print(text)


features_and_indexes = get_feature_names_and_indexes()

# --- Training split: loaded without normalization or standardization. ---
train_positive_data_path = 'data/train_positive_X.txt'
train_negative_data_path = 'data/train_negative_X.txt'
data: Tuple[np.ndarray, np.ndarray] = prepare_data(
    train_positive_data_path,
    train_negative_data_path,
    normalize=False,
    standardize=False,
)
X_train, y_train = data[0], data[1]
print('training data: {}'.format(X_train.shape))

# --- Test split: loaded with the same settings; ``data`` is reused. ---
test_positive_data_path = 'data/test_positive_X.txt'
test_negative_data_path = 'data/test_negative_X.txt'
data: Tuple[np.ndarray, np.ndarray] = prepare_data(
    test_positive_data_path,
    test_negative_data_path,
    normalize=False,
    standardize=False,
)
X_test, y_test = data[0], data[1]
print('testing data: {}'.format(X_test.shape))