Code example #1
def label_data_base(data_base_file: str, heuristics: bool):
    """Extract labels for the report database and save the labeled, heuristic, unknown and unlabeled parts as separate CSV files."""
    label_extractor = LabelExtractor()

    labeled_data_base, heuristic, unknown, unlabeled, unlabeled_hashes = label_extractor.get_labeled_data_base(
        data_base_file=get_data_base_path(data_base_file),
        heuristics=heuristics)

    data_base_id = get_file_id(config.data_base_path)
    data_base_name = data_base_file.split('.')[0]

    if len(labeled_data_base) > 0:
        drop_duplicates(labeled_data_base)
        labeled_data_base.to_csv(
            path_or_buf=join(
                config.data_base_path,
                f'{data_base_name}__labeled_{data_base_id:_>3}_heur{heuristics}.csv'),
            index=False)

        print('labeled')
        data_base_info(labeled_data_base, printable=True)

    if not heuristics and len(heuristic) > 0:
        drop_duplicates(heuristic)
        heuristic.to_csv(
            path_or_buf=join(
                config.data_base_path,
                f'{data_base_name}__heuristic_{data_base_id:_>3}_heur{heuristics}.csv'),
            index=False)

        print('heuristic')
        data_base_info(heuristic, printable=True)

    if len(unknown) > 0:
        drop_duplicates(unknown)
        unknown.to_csv(
            path_or_buf=join(
                config.data_base_path,
                f'{data_base_name}__unknown_{data_base_id:_>3}_heur{heuristics}.csv'),
            index=False)

        print('unknown')
        data_base_info(unknown, printable=True)

    if len(unlabeled) > 0:
        drop_duplicates(unlabeled)
        unlabeled.to_csv(
            path_or_buf=join(
                config.data_base_path,
                f'{data_base_name}__unlabeled_{data_base_id:_>3}_heur{heuristics}.csv'),
            index=False)

        print('unlabeled')
        data_base_info(unlabeled, printable=True)

    if len(unlabeled_hashes) > 0:
        param_id = get_file_id(config.parameters_path)
        save_obj(unlabeled_hashes,
                 filename=f'unlabeled_hashes_{data_base_name}_{param_id:_>3}')
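
A minimal invocation sketch (not part of the project): the CSV name below is a placeholder, and the helpers `get_data_base_path`, `get_file_id` and `config` come from the surrounding repository.

# Hypothetical usage; 'database_001.csv' is a placeholder file name.
label_data_base(data_base_file='database_001.csv', heuristics=False)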
Code example #2
def create_data_base(paths: tuple = (config.reports_path,
                                     config.info_lack_reports_path),
                     debug: bool = False,
                     process_count: int = 12) -> None:
    """Parse reports in parallel worker processes and save the extracted feature matrix as a CSV database."""
    if debug:
        report_paths = get_debug_report_paths()
    else:
        report_paths = get_report_paths(paths)

    manager = Manager()
    report_paths = SharedReportPaths(report_paths, manager)
    reports_features = SharedReportsFeatures(manager)

    ps = [
        Process(target=parsing_reports, args=(report_paths, reports_features))
        for _ in range(process_count)
    ]

    for p in ps:
        p.start()

    for p in ps:
        p.join()

    reports_features = reports_features.get_reports_features()

    data_base = pd.DataFrame(reports_features)

    data_base_info(data_base)

    data_base_id = get_file_id(config.data_base_path)
    data_base.to_csv(
        path_or_buf=join(config.data_base_path,
                         f'database_{data_base_id:_>3}.csv'),
        index=False)
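
A hedged usage sketch: the `paths` default already points at the project's report directories, so a call only has to choose the degree of parallelism; `debug=True` is assumed to switch to the smaller debug report set.

# Hypothetical usage; relies on the project's config paths being set up.
create_data_base(debug=True, process_count=4)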
Code example #3
def probabilities_distribution_visualization(correct: dict, incorrect: dict,
                                             incorrect_target: dict,
                                             training_space: dict) -> None:
    """Plot the per-bin share of correct and incorrect predictions as grouped bars and save the figure as a PNG."""
    def get_ratio(dist: dict) -> np.ndarray:
        n = sum(dist.values())
        ratio = np.array(list(dist.values()))

        return ratio / n

    thresholds = [f'{lbl + 0.1:.1f}-{lbl:.1f}' for lbl in correct.keys()]
    x_ticks = np.arange(0, len(thresholds))
    y_ticks = np.arange(0, 1, 0.1)
    y_labels = [f'{int(y_tick * 100)}%' for y_tick in y_ticks]
    correct_ratio = get_ratio(correct)
    incorrect_ratio = get_ratio(incorrect)
    incorrect_target_ratio = get_ratio(incorrect_target)
    width = 0.2

    fig, ax = plt.subplots(figsize=(19.2, 10.8))

    ax.bar(x_ticks,
           correct_ratio,
           width,
           color='lightseagreen',
           label='True Positive')
    ax.bar(x_ticks + width,
           incorrect_ratio,
           width,
           color='lightcoral',
           label='False Positive + False Negative')
    ax.bar(x_ticks + width * 2,
           incorrect_target_ratio,
           width,
           color='mediumslateblue',
           label='Target among wrong cases')

    ax.set_title(f"Classifier prediction ratio")
    ax.set_ylabel('Percent')
    ax.set_xlabel('Thresholds')
    ax.set_xticks(x_ticks + width)  # center each tick on the middle bar of the group
    ax.set_xticklabels(thresholds)
    ax.set_yticks(y_ticks)
    ax.set_yticklabels(y_labels)
    ax.set_ybound(lower=0)
    ax.legend()

    fig.tight_layout()

    file_id = get_file_id(training_space['performance'])

    plt.savefig(join(training_space['performance'],
                     f'{training_space["mark"]}__{file_id}.png'),
                dpi='figure',
                format='png',
                bbox_inches='tight')
    plt.clf()
    plt.close('all')
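
Judging from the `thresholds` labels, the three input dicts share the same keys: each key is the lower edge of a 0.1-wide probability bin and each value is a case count. The sketch below fabricates tiny counts purely to show the expected shapes; all numbers and paths are placeholders.

# Hypothetical inputs: keys are bin lower edges, values are case counts.
correct = {0.9: 120, 0.8: 40, 0.7: 15}
incorrect = {0.9: 5, 0.8: 12, 0.7: 20}
incorrect_target = {0.9: 2, 0.8: 6, 0.7: 9}
training_space = {'performance': 'performance_dir', 'mark': 'clf'}  # placeholder values
probabilities_distribution_visualization(correct, incorrect, incorrect_target, training_space)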
Code example #4
File: CVAE.py  Project: alex-snd/MalwareClassifier
def generate_data_base(model: nn.Module,
                       device: torch.device = None,
                       each_class: int = 5000,
                       batch_size: int = 512) -> None:
    """Decode random latent vectors for every class with the trained CVAE and save the synthetic samples as a CSV database."""
    def batch_decoding(_batch_size: int) -> None:
        batch_z = torch.randn((_batch_size, model.d_in),
                              dtype=torch.float32,
                              device=device)
        batch_labels = torch.stack([
            labels[label],
        ] * _batch_size).type(torch.float32).to(device)

        _, _, batch_data = model(batch_z, batch_labels)

        data.append(torch.cat([mark, batch_data.detach().cpu()], dim=1))

    torch.manual_seed(2531)
    # Fall back to an automatic device choice only when the caller did not supply one
    # (previously the passed-in device was unconditionally overwritten here).
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    last_batch_size = each_class % batch_size
    n_classes = 8
    labels = torch.eye(n_classes, n_classes)

    model.eval()

    training_space = create_training_space(model_name='CVAE', evaluate=True)
    model.load_parameters(
        get_check_point_parameters_path(training_space['check_point']))

    data = list()

    for label in range(n_classes):

        mark = torch.zeros((batch_size, 2)).fill_(label)
        for _ in range(each_class // batch_size):
            batch_decoding(batch_size)

        if last_batch_size != 0:
            mark = torch.zeros((last_batch_size, 2)).fill_(label)
            batch_decoding(last_batch_size)

    data = torch.cat(data, dim=0).numpy()
    data_base = pd.DataFrame(data)

    data_base_id = get_file_id(config.data_base_path)
    data_base.to_csv(
        path_or_buf=join(config.data_base_path,
                         f'artificial_database_{data_base_id:_>3}.csv'),
        index=False)
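
A hedged call sketch: the CVAE constructor arguments are project-specific, so `cvae_model` is assumed to have been built and trained elsewhere; the generated CSV lands in `config.data_base_path`.

# Hypothetical usage; `cvae_model` is an already constructed CVAE instance.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
generate_data_base(model=cvae_model, device=device, each_class=1000, batch_size=256)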
Code example #5
def silhouette_analysis(data: pd.DataFrame, prediction: np.ndarray, n_clusters: int, model_name: str) -> None:
    """Draw a per-cluster silhouette plot with the average silhouette coefficient marked and save it as a PNG."""
    cluster_labels = np.unique(prediction)

    if n_clusters == 1:
        print('The number of clusters must be greater than 1')
        return

    silhouette_values = silhouette_samples(data, prediction, metric='euclidean')

    y_ax_lower, y_ax_upper = 0, 0
    y_ticks = []

    rainbow = cm.get_cmap('rainbow', n_clusters)
    colors = rainbow(np.linspace(0, 1, n_clusters))

    plt.figure(figsize=(19.2, 10.8))

    for i, c in enumerate(cluster_labels):
        c_silhouette_values = silhouette_values[prediction == c]
        c_silhouette_values.sort()

        y_ax_upper += len(c_silhouette_values)

        plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_values, height=1.0,
                 edgecolor='none', color=colors[i])

        y_ticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower += len(c_silhouette_values)

    silhouette_avg = np.mean(silhouette_values)

    plt.axvline(silhouette_avg, color="red", linestyle="--")
    plt.yticks(y_ticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')

    plt.tight_layout()

    im_id = get_file_id(config.simple_analysis_path)
    plt.savefig(join(config.simple_analysis_path, f'silhouette_{n_clusters}_clusters_{model_name}_{im_id:_>3}.png'),
                dpi='figure', bbox_inches='tight')

    plt.clf()
    plt.close('all')
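
A usage sketch under the assumption that `data` is a scaled feature table: cluster it first (KMeans here, but any model that yields integer labels works), then pass the assignments in.

# Hypothetical usage; `data` is a pd.DataFrame of scaled features.
from sklearn.cluster import KMeans

km = KMeans(n_clusters=4, n_init=15, random_state=2531)
prediction = km.fit_predict(data)
silhouette_analysis(data, prediction, n_clusters=4, model_name='kmeans')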
Code example #6
def cleaning(reports_path: tuple = (config.reports_path, ),
             debug: bool = False,
             process_count: int = 12,
             seen_hashes_file: str = None):
    """Hash reports in parallel, move duplicate reports aside and persist the updated hash index."""
    manager = Manager()

    if debug:
        report_paths = manager.list(get_debug_report_paths())
    else:
        report_paths = manager.list(get_report_paths(paths=reports_path))

    data = manager.dict()

    lock_report_paths = Lock()
    lock_data = Lock()

    ps = [
        Process(target=report_cleaning,
                args=(report_paths, data, lock_report_paths, lock_data))
        for _ in range(process_count)
    ]
    for p in ps:
        p.start()

    for p in ps:
        p.join()

    if seen_hashes_file:
        hashes = get_seen_hashes(seen_hashes_file)
    else:
        hashes = dict()

    for i_report in data.keys():

        i_hash = data[i_report]

        if i_hash in hashes:
            move_to_duplicates(i_report, hashes[i_hash])
        else:
            hashes[i_hash] = i_report

    hash_id = get_file_id(config.parameters_path)
    save_obj(hashes, join(config.parameters_path, f'hashes_{hash_id:_>3}'))
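
A minimal call sketch: without `seen_hashes_file` the hash index starts empty; the file name below is a placeholder for an index saved by a previous run.

# Hypothetical usage; 'hashes_001' is a placeholder for a previously saved index.
cleaning(debug=True, process_count=4, seen_hashes_file='hashes_001')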
Code example #7
def scale_data_base(data_base_file: str, scale, scale_type: str) -> None:
    """Fit the given scaler on the feature columns, save the fitted scaler and write the scaled database to a new CSV."""
    data_base_id = get_file_id(config.data_base_path)
    data_base_name = data_base_file.split('.')[0]
    data_base = get_data_base(data_base_file)

    data_base_labels = data_base[data_base.columns[:2]]
    data_base = data_base.drop(data_base.columns[:2], axis=1)

    data_base_std = pd.DataFrame(scale.fit_transform(data_base),
                                 columns=data_base.columns)

    save_obj(
        scale,
        filename=f'{scale_type}_scale_{data_base_name}_{data_base_id:_>3}')

    data_base = pd.concat([data_base_labels, data_base_std], axis=1)

    data_base.to_csv(
        path_or_buf=join(
            config.data_base_path,
            f'{data_base_name}_{scale_type}_{data_base_id:_>3}.csv'),
        index=False)
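
A hedged usage sketch: `scale` can be any scikit-learn transformer exposing `fit_transform`; the file name is a placeholder and `scale_type` only tags the output file names.

# Hypothetical usage; 'database_001.csv' is a placeholder file name.
from sklearn.preprocessing import StandardScaler

scale_data_base('database_001.csv', scale=StandardScaler(), scale_type='standard')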
Code example #8
def scatter_prediction(data: pd.DataFrame, prediction: np.ndarray, n_clusters: int, model_name: str) -> None:
    """Scatter the first two feature columns colored by cluster assignment and save the plot as a PNG."""
    dots = data.to_numpy()

    color_len = prediction.max(initial=1) / n_clusters
    ticks = list(np.arange(color_len / 2, n_clusters * color_len, color_len))

    color_map = cm.get_cmap('rainbow', n_clusters)

    fig, ax = plt.subplots(figsize=(19.2, 10.8))
    scatter = ax.scatter(dots[:, 0], dots[:, 1], c=prediction, cmap=color_map, marker='o', alpha=1.0)
    color_bar = fig.colorbar(scatter, ticks=ticks, ax=ax, drawedges=True)
    color_bar.ax.set_yticklabels([t + 1 for t in range(n_clusters)])
    ax.set_title('Latent space')
    fig.tight_layout()

    im_id = get_file_id(config.simple_analysis_path)
    plt.savefig(join(config.simple_analysis_path, f'scatter_{n_clusters}_clusters_{model_name}_{im_id:_>3}.png'),
                dpi='figure', format='png', bbox_inches='tight')

    plt.clf()
    plt.close('all')
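
Only the first two columns of `data` are plotted, so the function presupposes a 2-D embedding. A sketch, assuming `latent_2d` is such a DataFrame and `prediction` holds the matching cluster labels:

# Hypothetical usage; `latent_2d` is a two-column pd.DataFrame of embedded points.
scatter_prediction(latent_2d, prediction, n_clusters=4, model_name='kmeans')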
Code example #9
def elbow_method(data: pd.DataFrame, max_n_cluster: int = 20) -> None:
    """Plot the KMeans distortion (inertia) for 1 to max_n_cluster - 1 clusters; the range upper bound is exclusive."""
    distortions = []

    for i in range(1, max_n_cluster):
        km = KMeans(n_clusters=i,
                    init='k-means++',
                    n_init=15,
                    max_iter=300,
                    random_state=2531)

        km.fit(data)

        distortions.append(km.inertia_)

    plt.figure(figsize=(19.2, 10.8))
    plt.plot(range(1, max_n_cluster), distortions, marker='o', color='royalblue')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.tight_layout()

    im_id = get_file_id(config.simple_analysis_path)
    plt.savefig(join(config.simple_analysis_path, f'elbow_{im_id:_>3}.png'), dpi='figure', bbox_inches='tight')
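
A usage sketch: pass the scaled feature table and an upper bound; the resulting distortion curve is saved to `config.simple_analysis_path`.

# Hypothetical usage; `data` is a pd.DataFrame of scaled features.
elbow_method(data, max_n_cluster=15)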